Compare commits

...

54 Commits

SHA1  Message  (Author, Date)
6417884969  remove v1_17  (Louis Dureuil, 2025-08-18 16:48:37 +02:00)
d68e6c355b  update to v1.22  (Louis Dureuil, 2025-08-18 16:46:46 +02:00)
c33d2558c1  Make clippy happy  (Clément Renault, 2025-08-18 16:41:51 +02:00)
94e4ef65d6  Switch to hannoy 0.0.4  (Clément Renault, 2025-08-18 16:41:51 +02:00)
246290ef05  Make cargo insta happy  (Kerollmops, 2025-08-18 16:41:51 +02:00)
e2803da5bc  Remove the vector_store reference  (Kerollmops, 2025-08-18 16:41:51 +02:00)
3c59702e70  Depend on Hannoy from crates.io  (Clément Renault, 2025-08-18 16:41:51 +02:00)
d08b62db7d  Move code to the right file  (Mubelotix, 2025-08-18 16:41:48 +02:00)
27d2cd7bd2  Make cargo fmt happy  (Clément Renault, 2025-08-18 16:38:13 +02:00)
619900e4d3  Make clippy happy  (Clément Renault, 2025-08-18 16:38:13 +02:00)
24b017e367  Dispatch the vector store based on the index version  (Clément Renault, 2025-08-18 16:38:13 +02:00)
e64852208c  Make the VectorStore aware of the index version  (Clément Renault, 2025-08-18 16:38:13 +02:00)
e4b28464fd  Expose Hannoy progress when upgrading  (Clément Renault, 2025-08-18 16:38:13 +02:00)
de90455809  First version of Hannoy dumpless upgrade  (Louis Dureuil, 2025-08-18 16:37:38 +02:00)
ca5dc1b032  Integrate the hannoy progress  (Clément Renault, 2025-08-18 16:23:05 +02:00)
5c464e9855  wip: Use Hamming when binary quantized  (Kerollmops, 2025-08-18 16:23:05 +02:00)
114d50dfba  Always use at least an ef = 100 when searching  (Clément Renault, 2025-08-18 16:23:05 +02:00)
6f0249cffc  Switch to hannoy with support for deletions  (Clément Renault, 2025-08-18 16:23:05 +02:00)
e000df8646  Add a missing cancelation call for hannoy  (Kerollmops, 2025-08-18 16:23:05 +02:00)
900e8a0d9c  Reintroduce changing the distance from Cosine to Cosine binary quantized  (Kerollmops, 2025-08-18 16:23:05 +02:00)
f5f173e451  Bump Hannoy's version  (Kerollmops, 2025-08-18 16:23:05 +02:00)
ce9d56377c  Increase efSearch from x2 to x10  (Clément Renault, 2025-08-18 16:23:05 +02:00)
c2d912645f  Increase efConstruction from 48 to 125  (Clément Renault, 2025-08-18 16:23:05 +02:00)
112d3f54e9  remove-me: Introduce an env var to change the embeddings chunk size  (Clément Renault, 2025-08-18 16:23:05 +02:00)
58a88c7933  Bump hannoy  (Kerollmops, 2025-08-18 16:23:05 +02:00)
203418ae49  Use a more feature-full Hannoy version  (Kerollmops, 2025-08-18 16:23:05 +02:00)
3bc192ae52  Reintroduce arroy and support for dumpless upgrade from previous versions  (Clément Renault, 2025-08-18 16:23:04 +02:00)
db9f205184  Rename the vector store const name and keep the vector-arroy db name  (Clément Renault, 2025-08-18 16:23:04 +02:00)
4d3a9dc43e  Rename the ArroyWrapper/HannoyWrapper into VectorStore  (Clément Renault, 2025-08-18 16:23:04 +02:00)
9a2a40a4fa  Use constants as the hannoy default parameters  (Clément Renault, 2025-08-18 16:23:04 +02:00)
bf921e9135  Use Hannoy instead of arroy  (Kerollmops, 2025-08-18 16:23:04 +02:00)
eda77aeb1a  Fix snapshots  (Louis Dureuil, 2025-08-18 16:23:04 +02:00)
1fed0bed18  Remove erroneous untagged annotation  (Louis Dureuil, 2025-08-18 16:23:04 +02:00)
658023e01b  camel case the fields in "origin"  (Louis Dureuil, 2025-08-18 16:23:04 +02:00)
b9c86e721f  Rename Body::with_file  (Louis Dureuil, 2025-08-18 16:23:04 +02:00)
a29497f720  Adjust timeouts  (Louis Dureuil, 2025-08-18 16:23:04 +02:00)
1ae1856ec2  Don't always hardcode Content-Type in proxy  (Louis Dureuil, 2025-08-18 16:23:03 +02:00)
184e9f72c1  Update snap  (Louis Dureuil, 2025-08-18 16:23:03 +02:00)
4220c877e1  Misc churn  (Louis Dureuil, 2025-08-18 16:23:03 +02:00)
dcd6951a0b  Move meilisearch_types::Network to its own module  (Louis Dureuil, 2025-08-18 16:23:03 +02:00)
1258bdb2b9  Make types Serialize and Deserialize for proxying  (Louis Dureuil, 2025-08-18 16:23:03 +02:00)
08f15cdf4b  New errors  (Louis Dureuil, 2025-08-18 16:23:03 +02:00)
85e3267490  Dependency changes  (Louis Dureuil, 2025-08-18 16:23:03 +02:00)
753db805a8  IndexScheduler::update_task now merges the task.network and accepts &mut Task  (Louis Dureuil, 2025-08-18 16:23:03 +02:00)
bb52a8683a  IndexScheduler::set_task_network  (Louis Dureuil, 2025-08-18 16:23:03 +02:00)
9a16c3a26d  file-store: persist returns the persisted File object  (Louis Dureuil, 2025-08-18 16:23:03 +02:00)
708bb766b0  Dump support for network  (Louis Dureuil, 2025-08-18 16:23:02 +02:00)
be065c4c51  Proxy all document tasks to the network when sharding is enabled  (Louis Dureuil, 2025-08-18 16:23:02 +02:00)
cda5995922  Shard documents  (Louis Dureuil, 2025-08-18 16:23:02 +02:00)
409ae70f0f  network: add sharding to Network and writeApiKey to Remotes  (Louis Dureuil, 2025-08-18 16:23:02 +02:00)
47b8e53985  Add proxy module to proxy requests to members of a network  (Louis Dureuil, 2025-08-18 16:23:02 +02:00)
eadd1bb5b5  Add new milli::update::new::indexer::sharding module  (Louis Dureuil, 2025-08-18 16:23:02 +02:00)
3a84f27738  Add network to Task and TaskView  (Louis Dureuil, 2025-08-18 16:23:02 +02:00)
401b064917  Add EE license  (Louis Dureuil, 2025-08-18 16:23:02 +02:00)
100 changed files with 2467 additions and 982 deletions

Cargo.lock (generated): 931 lines changed
File diff suppressed because it is too large.

@@ -23,7 +23,7 @@ members = [
]
[workspace.package]
version = "1.18.0"
version = "1.22.0"
authors = [
"Quentin de Quelen <quentin@dequelen.me>",
"Clément Renault <clement@meilisearch.com>",


@@ -19,3 +19,11 @@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
---
🔒 Meilisearch Enterprise Edition (EE)
Certain parts of this codebase are not licensed under the MIT license and are instead governed by the Business Source License 1.1.
See the LICENSE-EE file for details.

LICENSE-EE (new file, 67 lines)

@@ -0,0 +1,67 @@
Business Source License 1.1 Adapted for Meili SAS
This license is based on the Business Source License version 1.1, as published by MariaDB Corporation Ab.
Parameters
Licensor: Meili SAS
Licensed Work: Any file explicitly marked as “Enterprise Edition (EE)” or “governed by the Business Source License”.
Additional Use Grant:
You may use, modify, and distribute the Licensed Work for non-production purposes only, such as testing, development, or evaluation.
Production use of the Licensed Work requires a commercial license agreement with Meilisearch. Contact bonjour@meilisearch.com for licensing.
Change License: MIT
Change Date: Four years from the date the Licensed Work is published.
This License does not apply to any code outside of the Licensed Work, which remains under the MIT license.
For information about alternative licensing arrangements for the Licensed Work,
please contact bonjour@meilisearch.com or sales@meilisearch.com.
Notice
Business Source License 1.1
Terms
The Licensor hereby grants you the right to copy, modify, create derivative
works, redistribute, and make non-production use of the Licensed Work. The
Licensor may make an Additional Use Grant, above, permitting limited production use.
Effective on the Change Date, or the fourth anniversary of the first publicly
available distribution of a specific version of the Licensed Work under this
License, whichever comes first, the Licensor hereby grants you rights under
the terms of the Change License, and the rights granted in the paragraph
above terminate.
If your use of the Licensed Work does not comply with the requirements
currently in effect as described in this License, you must purchase a
commercial license from the Licensor, its affiliated entities, or authorized
resellers, or you must refrain from using the Licensed Work.
All copies of the original and modified Licensed Work, and derivative works
of the Licensed Work, are subject to this License. This License applies
separately for each version of the Licensed Work and the Change Date may vary
for each version of the Licensed Work released by Licensor.
You must conspicuously display this License on each original or modified copy
of the Licensed Work. If you receive the Licensed Work in original or
modified form from a third party, the terms and conditions set forth in this
License apply to your use of that work.
Any use of the Licensed Work in violation of this License will automatically
terminate your rights under this License for the current and all other
versions of the Licensed Work.
This License does not grant you any right in any trademark or logo of
Licensor or its affiliates (provided that you may use a trademark or logo of
Licensor as expressly required by this License).
TO THE EXTENT PERMITTED BY APPLICABLE LAW, THE LICENSED WORK IS PROVIDED ON
AN "AS IS" BASIS. LICENSOR HEREBY DISCLAIMS ALL WARRANTIES AND CONDITIONS,
EXPRESS OR IMPLIED, INCLUDING (WITHOUT LIMITATION) WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NON-INFRINGEMENT, AND
TITLE.


@@ -89,6 +89,26 @@ We also offer a wide range of dedicated guides to all Meilisearch features, such
Finally, for more in-depth information, refer to our articles explaining fundamental Meilisearch concepts such as [documents](https://www.meilisearch.com/docs/learn/core_concepts/documents?utm_campaign=oss&utm_source=github&utm_medium=meilisearch&utm_content=advanced) and [indexes](https://www.meilisearch.com/docs/learn/core_concepts/indexes?utm_campaign=oss&utm_source=github&utm_medium=meilisearch&utm_content=advanced).
## 🧾 Editions & Licensing
Meilisearch is available in two editions:
### 🧪 Community Edition (CE)
- Fully open source under the [MIT license](./LICENSE)
- Core search engine with fast and relevant full-text, semantic or hybrid search
- Free to use for anyone, including commercial usage
### 🏢 Enterprise Edition (EE)
- Includes advanced features such as:
- Sharding
- Governed by a [commercial license](./LICENSE-EE) or the [Business Source License 1.1](https://mariadb.com/bsl11)
- Not allowed in production without a commercial agreement with Meilisearch.
- You may use, modify, and distribute the Licensed Work for non-production purposes only, such as testing, development, or evaluation.
Want access to Enterprise features? → Contact us at [sales@meilisearch.com](mailto:sales@meilisearch.com).
## 📊 Telemetry
Meilisearch collects **anonymized** user data to help us improve our product. You can [deactivate this](https://www.meilisearch.com/docs/learn/what_is_meilisearch/telemetry?utm_campaign=oss&utm_source=github&utm_medium=meilisearch&utm_content=telemetry#how-to-disable-data-collection) whenever you want.


@@ -154,6 +154,7 @@ fn indexing_songs_default(c: &mut Criterion) {
&mut new_fields_ids_map,
&|| false,
Progress::default(),
None,
)
.unwrap();
@@ -221,6 +222,7 @@ fn reindexing_songs_default(c: &mut Criterion) {
&mut new_fields_ids_map,
&|| false,
Progress::default(),
None,
)
.unwrap();
@@ -266,6 +268,7 @@ fn reindexing_songs_default(c: &mut Criterion) {
&mut new_fields_ids_map,
&|| false,
Progress::default(),
None,
)
.unwrap();
@@ -335,6 +338,7 @@ fn deleting_songs_in_batches_default(c: &mut Criterion) {
&mut new_fields_ids_map,
&|| false,
Progress::default(),
None,
)
.unwrap();
@@ -412,6 +416,7 @@ fn indexing_songs_in_three_batches_default(c: &mut Criterion) {
&mut new_fields_ids_map,
&|| false,
Progress::default(),
None,
)
.unwrap();
@@ -457,6 +462,7 @@ fn indexing_songs_in_three_batches_default(c: &mut Criterion) {
&mut new_fields_ids_map,
&|| false,
Progress::default(),
None,
)
.unwrap();
@@ -498,6 +504,7 @@ fn indexing_songs_in_three_batches_default(c: &mut Criterion) {
&mut new_fields_ids_map,
&|| false,
Progress::default(),
None,
)
.unwrap();
@@ -566,6 +573,7 @@ fn indexing_songs_without_faceted_numbers(c: &mut Criterion) {
&mut new_fields_ids_map,
&|| false,
Progress::default(),
None,
)
.unwrap();
@@ -633,6 +641,7 @@ fn indexing_songs_without_faceted_fields(c: &mut Criterion) {
&mut new_fields_ids_map,
&|| false,
Progress::default(),
None,
)
.unwrap();
@@ -700,6 +709,7 @@ fn indexing_wiki(c: &mut Criterion) {
&mut new_fields_ids_map,
&|| false,
Progress::default(),
None,
)
.unwrap();
@@ -766,6 +776,7 @@ fn reindexing_wiki(c: &mut Criterion) {
&mut new_fields_ids_map,
&|| false,
Progress::default(),
None,
)
.unwrap();
@@ -811,6 +822,7 @@ fn reindexing_wiki(c: &mut Criterion) {
&mut new_fields_ids_map,
&|| false,
Progress::default(),
None,
)
.unwrap();
@@ -879,6 +891,7 @@ fn deleting_wiki_in_batches_default(c: &mut Criterion) {
&mut new_fields_ids_map,
&|| false,
Progress::default(),
None,
)
.unwrap();
@@ -956,6 +969,7 @@ fn indexing_wiki_in_three_batches(c: &mut Criterion) {
&mut new_fields_ids_map,
&|| false,
Progress::default(),
None,
)
.unwrap();
@@ -1002,6 +1016,7 @@ fn indexing_wiki_in_three_batches(c: &mut Criterion) {
&mut new_fields_ids_map,
&|| false,
Progress::default(),
None,
)
.unwrap();
@@ -1044,6 +1059,7 @@ fn indexing_wiki_in_three_batches(c: &mut Criterion) {
&mut new_fields_ids_map,
&|| false,
Progress::default(),
None,
)
.unwrap();
@@ -1111,6 +1127,7 @@ fn indexing_movies_default(c: &mut Criterion) {
&mut new_fields_ids_map,
&|| false,
Progress::default(),
None,
)
.unwrap();
@@ -1177,6 +1194,7 @@ fn reindexing_movies_default(c: &mut Criterion) {
&mut new_fields_ids_map,
&|| false,
Progress::default(),
None,
)
.unwrap();
@@ -1222,6 +1240,7 @@ fn reindexing_movies_default(c: &mut Criterion) {
&mut new_fields_ids_map,
&|| false,
Progress::default(),
None,
)
.unwrap();
@@ -1290,6 +1309,7 @@ fn deleting_movies_in_batches_default(c: &mut Criterion) {
&mut new_fields_ids_map,
&|| false,
Progress::default(),
None,
)
.unwrap();
@@ -1404,6 +1424,7 @@ fn indexing_movies_in_three_batches(c: &mut Criterion) {
&mut new_fields_ids_map,
&|| false,
Progress::default(),
None,
)
.unwrap();
@@ -1449,6 +1470,7 @@ fn indexing_movies_in_three_batches(c: &mut Criterion) {
&mut new_fields_ids_map,
&|| false,
Progress::default(),
None,
)
.unwrap();
@@ -1490,6 +1512,7 @@ fn indexing_movies_in_three_batches(c: &mut Criterion) {
&mut new_fields_ids_map,
&|| false,
Progress::default(),
None,
)
.unwrap();
@@ -1580,6 +1603,7 @@ fn indexing_nested_movies_default(c: &mut Criterion) {
&mut new_fields_ids_map,
&|| false,
Progress::default(),
None,
)
.unwrap();
@@ -1671,6 +1695,7 @@ fn deleting_nested_movies_in_batches_default(c: &mut Criterion) {
&mut new_fields_ids_map,
&|| false,
Progress::default(),
None,
)
.unwrap();
@@ -1754,6 +1779,7 @@ fn indexing_nested_movies_without_faceted_fields(c: &mut Criterion) {
&mut new_fields_ids_map,
&|| false,
Progress::default(),
None,
)
.unwrap();
@@ -1821,6 +1847,7 @@ fn indexing_geo(c: &mut Criterion) {
&mut new_fields_ids_map,
&|| false,
Progress::default(),
None,
)
.unwrap();
@@ -1887,6 +1914,7 @@ fn reindexing_geo(c: &mut Criterion) {
&mut new_fields_ids_map,
&|| false,
Progress::default(),
None,
)
.unwrap();
@@ -1932,6 +1960,7 @@ fn reindexing_geo(c: &mut Criterion) {
&mut new_fields_ids_map,
&|| false,
Progress::default(),
None,
)
.unwrap();
@@ -2000,6 +2029,7 @@ fn deleting_geo_in_batches_default(c: &mut Criterion) {
&mut new_fields_ids_map,
&|| false,
Progress::default(),
None,
)
.unwrap();


@@ -123,6 +123,7 @@ pub fn base_setup(conf: &Conf) -> Index {
&mut new_fields_ids_map,
&|| false,
Progress::default(),
None,
)
.unwrap();


@@ -10,7 +10,7 @@ use meilisearch_types::keys::Key;
use meilisearch_types::milli::update::IndexDocumentsMethod;
use meilisearch_types::settings::Unchecked;
use meilisearch_types::tasks::{
Details, ExportIndexSettings, IndexSwap, KindWithContent, Status, Task, TaskId,
Details, ExportIndexSettings, IndexSwap, KindWithContent, Status, Task, TaskId, TaskNetwork,
};
use meilisearch_types::InstanceUid;
use roaring::RoaringBitmap;
@@ -94,6 +94,8 @@ pub struct TaskDump {
default
)]
pub finished_at: Option<OffsetDateTime>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub network: Option<TaskNetwork>,
}
// A `Kind` specific version made for the dump. If modified you may break the dump.
@@ -172,6 +174,7 @@ impl From<Task> for TaskDump {
enqueued_at: task.enqueued_at,
started_at: task.started_at,
finished_at: task.finished_at,
network: task.network,
}
}
}
@@ -251,11 +254,12 @@ pub(crate) mod test {
use maplit::{btreemap, btreeset};
use meilisearch_types::batches::{Batch, BatchEnqueuedAt, BatchStats};
use meilisearch_types::facet_values_sort::FacetValuesSort;
use meilisearch_types::features::{Network, Remote, RuntimeTogglableFeatures};
use meilisearch_types::features::RuntimeTogglableFeatures;
use meilisearch_types::index_uid_pattern::IndexUidPattern;
use meilisearch_types::keys::{Action, Key};
use meilisearch_types::milli::update::Setting;
use meilisearch_types::milli::{self, FilterableAttributesRule};
use meilisearch_types::network::{Network, Remote};
use meilisearch_types::settings::{Checked, FacetingSettings, Settings};
use meilisearch_types::task_view::DetailsView;
use meilisearch_types::tasks::{BatchStopReason, Details, Kind, Status};
@@ -384,6 +388,7 @@ pub(crate) mod test {
enqueued_at: datetime!(2022-11-11 0:00 UTC),
started_at: Some(datetime!(2022-11-20 0:00 UTC)),
finished_at: Some(datetime!(2022-11-21 0:00 UTC)),
network: None,
},
None,
),
@@ -408,6 +413,7 @@ pub(crate) mod test {
enqueued_at: datetime!(2022-11-11 0:00 UTC),
started_at: None,
finished_at: None,
network: None,
},
Some(vec![
json!({ "id": 4, "race": "leonberg" }).as_object().unwrap().clone(),
@@ -427,6 +433,7 @@ pub(crate) mod test {
enqueued_at: datetime!(2022-11-15 0:00 UTC),
started_at: None,
finished_at: None,
network: None,
},
None,
),
@@ -539,7 +546,8 @@ pub(crate) mod test {
fn create_test_network() -> Network {
Network {
local: Some("myself".to_string()),
remotes: maplit::btreemap! {"other".to_string() => Remote { url: "http://test".to_string(), search_api_key: Some("apiKey".to_string()) }},
remotes: maplit::btreemap! {"other".to_string() => Remote { url: "http://test".to_string(), search_api_key: Some("apiKey".to_string()), write_api_key: Some("docApiKey".to_string()) }},
sharding: false,
}
}


@@ -163,6 +163,7 @@ impl CompatV5ToV6 {
enqueued_at: task_view.enqueued_at,
started_at: task_view.started_at,
finished_at: task_view.finished_at,
network: None,
};
(task, content_file)


@@ -24,7 +24,7 @@ pub type Batch = meilisearch_types::batches::Batch;
pub type Key = meilisearch_types::keys::Key;
pub type ChatCompletionSettings = meilisearch_types::features::ChatCompletionSettings;
pub type RuntimeTogglableFeatures = meilisearch_types::features::RuntimeTogglableFeatures;
pub type Network = meilisearch_types::features::Network;
pub type Network = meilisearch_types::network::Network;
pub type Webhooks = meilisearch_types::webhooks::WebhooksDumpView;
// ===== Other types to clarify the code of the compat module


@@ -5,8 +5,9 @@ use std::path::PathBuf;
use flate2::write::GzEncoder;
use flate2::Compression;
use meilisearch_types::batches::Batch;
use meilisearch_types::features::{ChatCompletionSettings, Network, RuntimeTogglableFeatures};
use meilisearch_types::features::{ChatCompletionSettings, RuntimeTogglableFeatures};
use meilisearch_types::keys::Key;
use meilisearch_types::network::Network;
use meilisearch_types::settings::{Checked, Settings};
use meilisearch_types::webhooks::WebhooksDumpView;
use serde_json::{Map, Value};


@@ -148,11 +148,10 @@ impl File {
Ok(Self { path: PathBuf::new(), file: None })
}
pub fn persist(self) -> Result<()> {
if let Some(file) = self.file {
file.persist(&self.path)?;
}
Ok(())
pub fn persist(self) -> Result<Option<StdFile>> {
let Some(file) = self.file else { return Ok(None) };
Ok(Some(file.persist(&self.path)?))
}
}
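A minimal call-site sketch of what the new signature enables (assumed to run inside a function returning the crate's `Result`); the `document_addition` hunk later in this diff relies on exactly this to keep the payload around for proxying:
// `persist` now returns the persisted handle instead of dropping it.
let persisted: Option<StdFile> = update_file.persist()?;
if let Some(file) = persisted {
    // the handle stays usable once the file is durable on disk
    let _metadata = file.metadata();
}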


@@ -129,6 +129,7 @@ fn main() {
&mut new_fields_ids_map,
&|| false,
Progress::default(),
None,
)
.unwrap();


@@ -147,6 +147,7 @@ impl<'a> Dump<'a> {
canceled_by: task.canceled_by,
details: task.details,
status: task.status,
network: task.network,
kind: match task.kind {
KindDump::DocumentImport {
primary_key,


@@ -1,8 +1,9 @@
use std::sync::{Arc, RwLock};
use meilisearch_types::features::{InstanceTogglableFeatures, Network, RuntimeTogglableFeatures};
use meilisearch_types::features::{InstanceTogglableFeatures, RuntimeTogglableFeatures};
use meilisearch_types::heed::types::{SerdeJson, Str};
use meilisearch_types::heed::{Database, Env, RwTxn, WithoutTls};
use meilisearch_types::network::Network;
use crate::error::FeatureNotEnabledError;
use crate::Result;


@@ -143,10 +143,10 @@ impl IndexStats {
///
/// - rtxn: a RO transaction for the index, obtained from `Index::read_txn()`.
pub fn new(index: &Index, rtxn: &RoTxn) -> milli::Result<Self> {
let arroy_stats = index.arroy_stats(rtxn)?;
let hannoy_stats = index.hannoy_stats(rtxn)?;
Ok(IndexStats {
number_of_embeddings: Some(arroy_stats.number_of_embeddings),
number_of_embedded_documents: Some(arroy_stats.documents.len()),
number_of_embeddings: Some(hannoy_stats.number_of_embeddings),
number_of_embedded_documents: Some(hannoy_stats.documents.len()),
documents_database_stats: index.documents_stats(rtxn)?.unwrap_or_default(),
number_of_documents: None,
database_size: index.on_disk_size()?,


@@ -230,6 +230,7 @@ pub fn snapshot_task(task: &Task) -> String {
details,
status,
kind,
network,
} = task;
snap.push('{');
snap.push_str(&format!("uid: {uid}, "));
@@ -247,6 +248,9 @@ pub fn snapshot_task(task: &Task) -> String {
snap.push_str(&format!("details: {}, ", &snapshot_details(details)));
}
snap.push_str(&format!("kind: {kind:?}"));
if let Some(network) = network {
snap.push_str(&format!("network: {network:?}, "))
}
snap.push('}');
snap


@@ -52,7 +52,7 @@ use flate2::bufread::GzEncoder;
use flate2::Compression;
use meilisearch_types::batches::Batch;
use meilisearch_types::features::{
ChatCompletionSettings, InstanceTogglableFeatures, Network, RuntimeTogglableFeatures,
ChatCompletionSettings, InstanceTogglableFeatures, RuntimeTogglableFeatures,
};
use meilisearch_types::heed::byteorder::BE;
use meilisearch_types::heed::types::{DecodeIgnore, SerdeJson, Str, I128};
@@ -63,8 +63,9 @@ use meilisearch_types::milli::vector::{
Embedder, EmbedderOptions, RuntimeEmbedder, RuntimeEmbedders, RuntimeFragment,
};
use meilisearch_types::milli::{self, Index};
use meilisearch_types::network::Network;
use meilisearch_types::task_view::TaskView;
use meilisearch_types::tasks::{KindWithContent, Task};
use meilisearch_types::tasks::{KindWithContent, Task, TaskNetwork};
use meilisearch_types::webhooks::{Webhook, WebhooksDumpView, WebhooksView};
use milli::vector::db::IndexEmbeddingConfig;
use processing::ProcessingTasks;
@@ -666,6 +667,16 @@ impl IndexScheduler {
self.queue.get_task_ids_from_authorized_indexes(&rtxn, query, filters, &processing)
}
pub fn set_task_network(&self, task_id: TaskId, network: TaskNetwork) -> Result<()> {
let mut wtxn = self.env.write_txn()?;
let mut task =
self.queue.tasks.get_task(&wtxn, task_id)?.ok_or(Error::TaskNotFound(task_id))?;
task.network = Some(network);
self.queue.tasks.all_tasks.put(&mut wtxn, &task_id, &task)?;
wtxn.commit()?;
Ok(())
}
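A hedged sketch of the expected call site, assuming the proxy has collected a `remote_tasks: BTreeMap<String, RemoteTask>` from the remotes' responses:
// Record on the local task which uid each remote assigned to the proxied copy
// (failures collapse into `RemoteTask` through its `From<Result<..>>` impl).
index_scheduler.set_task_network(task.uid, TaskNetwork::Remotes { remote_tasks })?;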
/// Return the batches matching the query from the user's point of view along
/// with the total number of batches matching the query, ignoring from and limit.
///


@@ -279,6 +279,7 @@ impl Queue {
details: kind.default_details(),
status: Status::Enqueued,
kind: kind.clone(),
network: None,
};
// For deletion and cancelation tasks, we want to make extra sure that they
// don't attempt to delete/cancel tasks that are newer than themselves.


@@ -97,7 +97,22 @@ impl TaskQueue {
Ok(self.all_tasks.get(rtxn, &task_id)?)
}
pub(crate) fn update_task(&self, wtxn: &mut RwTxn, task: &Task) -> Result<()> {
/// Update the inverted task indexes and write the new value of the task.
///
/// The passed `task` object typically comes from a previous transaction, so two kinds of modification might have occurred:
/// 1. Modification to the `task` object after loading it from the DB (the purpose of this method is to persist these changes)
/// 2. Modification to the task committed by another transaction in the DB (an annoying consequence of having lost the original
/// transaction from which the `task` instance was deserialized)
///
/// When calling this function, this `task` is modified to take into account any existing `network`
/// that may have been added since the task was loaded into memory.
///
/// Any other modification committed to the DB since the `task` parameter was pulled from it will be overwritten.
///
/// # Errors
///
/// - CorruptedTaskQueue: The task doesn't exist in the database
pub(crate) fn update_task(&self, wtxn: &mut RwTxn, task: &mut Task) -> Result<()> {
let old_task = self.get_task(wtxn, task.uid)?.ok_or(Error::CorruptedTaskQueue)?;
let reprocessing = old_task.status != Status::Enqueued;
@@ -157,6 +172,12 @@ impl TaskQueue {
}
}
task.network = match (old_task.network, task.network.take()) {
(None, None) => None,
(None, Some(network)) | (Some(network), None) => Some(network),
(Some(_), Some(network)) => Some(network),
};
self.all_tasks.put(wtxn, &task.uid, task)?;
Ok(())
}
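Read as a truth table, the merge above keeps whichever `network` value exists and lets the freshly set in-memory value win when both sides have one; a minimal standalone restatement (hypothetical helper, not part of the diff):
// Equivalent to the four-arm match: prefer the incoming value, fall back to
// the value already committed in the DB.
fn merge_network(old: Option<TaskNetwork>, new: Option<TaskNetwork>) -> Option<TaskNetwork> {
    new.or(old)
}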


@@ -268,7 +268,7 @@ impl IndexScheduler {
self.queue
.tasks
.update_task(&mut wtxn, &task)
.update_task(&mut wtxn, &mut task)
.map_err(|e| Error::UnrecoverableError(Box::new(e)))?;
}
if let Some(canceled_by) = canceled_by {
@@ -349,7 +349,7 @@ impl IndexScheduler {
self.queue
.tasks
.update_task(&mut wtxn, &task)
.update_task(&mut wtxn, &mut task)
.map_err(|e| Error::UnrecoverableError(Box::new(e)))?;
}
}


@@ -147,7 +147,6 @@ impl IndexScheduler {
};
let mut index_wtxn = index.write_txn()?;
let index_version = index.get_version(&index_wtxn)?.unwrap_or((1, 12, 0));
let package_version = (VERSION_MAJOR, VERSION_MINOR, VERSION_PATCH);
if index_version != package_version {


@@ -66,6 +66,11 @@ impl IndexScheduler {
}
IndexOperation::DocumentOperation { index_uid, primary_key, operations, mut tasks } => {
progress.update_progress(DocumentOperationProgress::RetrievingConfig);
let network = self.network();
let shards = network.shards();
// TODO: at some point, for better efficiency we might want to reuse the bumpalo for successive batches.
// this is made difficult by the fact we're doing private clones of the index scheduler and sending it
// to a fresh thread.
@@ -130,6 +135,7 @@ impl IndexScheduler {
&mut new_fields_ids_map,
&|| must_stop_processing.get(),
progress.clone(),
shards.as_ref(),
)
.map_err(|e| Error::from_milli(e, Some(index_uid.clone())))?;
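The `shards.as_ref()` threaded through here is the same trailing parameter that appears as `None,` in every benchmark and dump call site earlier in this diff: non-sharded callers simply opt out. The routing itself lives in the new `milli::update::new::indexer::sharding` module and is not shown in this diff; purely as an illustration of the idea, shard membership can be decided by hashing the external document id over an ordering all nodes agree on:
use std::collections::hash_map::DefaultHasher;
use std::hash::{Hash, Hasher};

// Illustrative sketch only (not the module's actual code): pick a node by
// hashing the document id, keep the document iff that node is one of ours.
fn is_mine(own: &[String], others: &[String], docid: &str) -> bool {
    let mut nodes: Vec<&String> = own.iter().chain(others.iter()).collect();
    nodes.sort(); // every node must agree on the ordering
    let mut hasher = DefaultHasher::new();
    docid.hash(&mut hasher);
    let chosen = nodes[(hasher.finish() as usize) % nodes.len()];
    own.contains(chosen)
}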


@@ -6,7 +6,7 @@ source: crates/index-scheduler/src/scheduler/test_failure.rs
[]
----------------------------------------------------------------------
### All Tasks:
0 {uid: 0, batch_uid: 0, status: succeeded, details: { from: (1, 12, 0), to: (1, 18, 0) }, kind: UpgradeDatabase { from: (1, 12, 0) }}
0 {uid: 0, batch_uid: 0, status: succeeded, details: { from: (1, 12, 0), to: (1, 22, 0) }, kind: UpgradeDatabase { from: (1, 12, 0) }}
1 {uid: 1, batch_uid: 1, status: succeeded, details: { primary_key: Some("mouse"), old_new_uid: None, new_index_uid: None }, kind: IndexCreation { index_uid: "catto", primary_key: Some("mouse") }}
2 {uid: 2, batch_uid: 2, status: succeeded, details: { primary_key: Some("bone"), old_new_uid: None, new_index_uid: None }, kind: IndexCreation { index_uid: "doggo", primary_key: Some("bone") }}
3 {uid: 3, batch_uid: 3, status: failed, error: ResponseError { code: 200, message: "Index `doggo` already exists.", error_code: "index_already_exists", error_type: "invalid_request", error_link: "https://docs.meilisearch.com/errors#index_already_exists" }, details: { primary_key: Some("bone"), old_new_uid: None, new_index_uid: None }, kind: IndexCreation { index_uid: "doggo", primary_key: Some("bone") }}
@@ -57,7 +57,7 @@ girafo: { number_of_documents: 0, field_distribution: {} }
[timestamp] [4,]
----------------------------------------------------------------------
### All Batches:
0 {uid: 0, details: {"upgradeFrom":"v1.12.0","upgradeTo":"v1.18.0"}, stats: {"totalNbTasks":1,"status":{"succeeded":1},"types":{"upgradeDatabase":1},"indexUids":{}}, stop reason: "stopped after the last task of type `upgradeDatabase` because they cannot be batched with tasks of any other type.", }
0 {uid: 0, details: {"upgradeFrom":"v1.12.0","upgradeTo":"v1.22.0"}, stats: {"totalNbTasks":1,"status":{"succeeded":1},"types":{"upgradeDatabase":1},"indexUids":{}}, stop reason: "stopped after the last task of type `upgradeDatabase` because they cannot be batched with tasks of any other type.", }
1 {uid: 1, details: {"primaryKey":"mouse"}, stats: {"totalNbTasks":1,"status":{"succeeded":1},"types":{"indexCreation":1},"indexUids":{"catto":1}}, stop reason: "created batch containing only task with id 1 of type `indexCreation` that cannot be batched with any other task.", }
2 {uid: 2, details: {"primaryKey":"bone"}, stats: {"totalNbTasks":1,"status":{"succeeded":1},"types":{"indexCreation":1},"indexUids":{"doggo":1}}, stop reason: "created batch containing only task with id 2 of type `indexCreation` that cannot be batched with any other task.", }
3 {uid: 3, details: {"primaryKey":"bone"}, stats: {"totalNbTasks":1,"status":{"failed":1},"types":{"indexCreation":1},"indexUids":{"doggo":1}}, stop reason: "created batch containing only task with id 3 of type `indexCreation` that cannot be batched with any other task.", }


@@ -6,7 +6,7 @@ source: crates/index-scheduler/src/scheduler/test_failure.rs
[]
----------------------------------------------------------------------
### All Tasks:
0 {uid: 0, status: enqueued, details: { from: (1, 12, 0), to: (1, 18, 0) }, kind: UpgradeDatabase { from: (1, 12, 0) }}
0 {uid: 0, status: enqueued, details: { from: (1, 12, 0), to: (1, 22, 0) }, kind: UpgradeDatabase { from: (1, 12, 0) }}
----------------------------------------------------------------------
### Status:
enqueued [0,]


@@ -6,7 +6,7 @@ source: crates/index-scheduler/src/scheduler/test_failure.rs
[]
----------------------------------------------------------------------
### All Tasks:
0 {uid: 0, status: enqueued, details: { from: (1, 12, 0), to: (1, 18, 0) }, kind: UpgradeDatabase { from: (1, 12, 0) }}
0 {uid: 0, status: enqueued, details: { from: (1, 12, 0), to: (1, 22, 0) }, kind: UpgradeDatabase { from: (1, 12, 0) }}
1 {uid: 1, status: enqueued, details: { primary_key: Some("mouse"), old_new_uid: None, new_index_uid: None }, kind: IndexCreation { index_uid: "catto", primary_key: Some("mouse") }}
----------------------------------------------------------------------
### Status:


@@ -6,7 +6,7 @@ source: crates/index-scheduler/src/scheduler/test_failure.rs
[]
----------------------------------------------------------------------
### All Tasks:
0 {uid: 0, batch_uid: 0, status: failed, error: ResponseError { code: 200, message: "Planned failure for tests.", error_code: "internal", error_type: "internal", error_link: "https://docs.meilisearch.com/errors#internal" }, details: { from: (1, 12, 0), to: (1, 18, 0) }, kind: UpgradeDatabase { from: (1, 12, 0) }}
0 {uid: 0, batch_uid: 0, status: failed, error: ResponseError { code: 200, message: "Planned failure for tests.", error_code: "internal", error_type: "internal", error_link: "https://docs.meilisearch.com/errors#internal" }, details: { from: (1, 12, 0), to: (1, 22, 0) }, kind: UpgradeDatabase { from: (1, 12, 0) }}
1 {uid: 1, status: enqueued, details: { primary_key: Some("mouse"), old_new_uid: None, new_index_uid: None }, kind: IndexCreation { index_uid: "catto", primary_key: Some("mouse") }}
----------------------------------------------------------------------
### Status:
@@ -37,7 +37,7 @@ catto [1,]
[timestamp] [0,]
----------------------------------------------------------------------
### All Batches:
0 {uid: 0, details: {"upgradeFrom":"v1.12.0","upgradeTo":"v1.18.0"}, stats: {"totalNbTasks":1,"status":{"failed":1},"types":{"upgradeDatabase":1},"indexUids":{}}, stop reason: "stopped after the last task of type `upgradeDatabase` because they cannot be batched with tasks of any other type.", }
0 {uid: 0, details: {"upgradeFrom":"v1.12.0","upgradeTo":"v1.22.0"}, stats: {"totalNbTasks":1,"status":{"failed":1},"types":{"upgradeDatabase":1},"indexUids":{}}, stop reason: "stopped after the last task of type `upgradeDatabase` because they cannot be batched with tasks of any other type.", }
----------------------------------------------------------------------
### Batch to tasks mapping:
0 [0,]


@@ -6,7 +6,7 @@ source: crates/index-scheduler/src/scheduler/test_failure.rs
[]
----------------------------------------------------------------------
### All Tasks:
0 {uid: 0, batch_uid: 0, status: failed, error: ResponseError { code: 200, message: "Planned failure for tests.", error_code: "internal", error_type: "internal", error_link: "https://docs.meilisearch.com/errors#internal" }, details: { from: (1, 12, 0), to: (1, 18, 0) }, kind: UpgradeDatabase { from: (1, 12, 0) }}
0 {uid: 0, batch_uid: 0, status: failed, error: ResponseError { code: 200, message: "Planned failure for tests.", error_code: "internal", error_type: "internal", error_link: "https://docs.meilisearch.com/errors#internal" }, details: { from: (1, 12, 0), to: (1, 22, 0) }, kind: UpgradeDatabase { from: (1, 12, 0) }}
1 {uid: 1, status: enqueued, details: { primary_key: Some("mouse"), old_new_uid: None, new_index_uid: None }, kind: IndexCreation { index_uid: "catto", primary_key: Some("mouse") }}
2 {uid: 2, status: enqueued, details: { primary_key: Some("bone"), old_new_uid: None, new_index_uid: None }, kind: IndexCreation { index_uid: "doggo", primary_key: Some("bone") }}
----------------------------------------------------------------------
@@ -40,7 +40,7 @@ doggo [2,]
[timestamp] [0,]
----------------------------------------------------------------------
### All Batches:
0 {uid: 0, details: {"upgradeFrom":"v1.12.0","upgradeTo":"v1.18.0"}, stats: {"totalNbTasks":1,"status":{"failed":1},"types":{"upgradeDatabase":1},"indexUids":{}}, stop reason: "stopped after the last task of type `upgradeDatabase` because they cannot be batched with tasks of any other type.", }
0 {uid: 0, details: {"upgradeFrom":"v1.12.0","upgradeTo":"v1.22.0"}, stats: {"totalNbTasks":1,"status":{"failed":1},"types":{"upgradeDatabase":1},"indexUids":{}}, stop reason: "stopped after the last task of type `upgradeDatabase` because they cannot be batched with tasks of any other type.", }
----------------------------------------------------------------------
### Batch to tasks mapping:
0 [0,]


@@ -6,7 +6,7 @@ source: crates/index-scheduler/src/scheduler/test_failure.rs
[]
----------------------------------------------------------------------
### All Tasks:
0 {uid: 0, batch_uid: 0, status: succeeded, details: { from: (1, 12, 0), to: (1, 18, 0) }, kind: UpgradeDatabase { from: (1, 12, 0) }}
0 {uid: 0, batch_uid: 0, status: succeeded, details: { from: (1, 12, 0), to: (1, 22, 0) }, kind: UpgradeDatabase { from: (1, 12, 0) }}
1 {uid: 1, status: enqueued, details: { primary_key: Some("mouse"), old_new_uid: None, new_index_uid: None }, kind: IndexCreation { index_uid: "catto", primary_key: Some("mouse") }}
2 {uid: 2, status: enqueued, details: { primary_key: Some("bone"), old_new_uid: None, new_index_uid: None }, kind: IndexCreation { index_uid: "doggo", primary_key: Some("bone") }}
3 {uid: 3, status: enqueued, details: { primary_key: Some("bone"), old_new_uid: None, new_index_uid: None }, kind: IndexCreation { index_uid: "doggo", primary_key: Some("bone") }}
@@ -43,7 +43,7 @@ doggo [2,3,]
[timestamp] [0,]
----------------------------------------------------------------------
### All Batches:
0 {uid: 0, details: {"upgradeFrom":"v1.12.0","upgradeTo":"v1.18.0"}, stats: {"totalNbTasks":1,"status":{"succeeded":1},"types":{"upgradeDatabase":1},"indexUids":{}}, stop reason: "stopped after the last task of type `upgradeDatabase` because they cannot be batched with tasks of any other type.", }
0 {uid: 0, details: {"upgradeFrom":"v1.12.0","upgradeTo":"v1.22.0"}, stats: {"totalNbTasks":1,"status":{"succeeded":1},"types":{"upgradeDatabase":1},"indexUids":{}}, stop reason: "stopped after the last task of type `upgradeDatabase` because they cannot be batched with tasks of any other type.", }
----------------------------------------------------------------------
### Batch to tasks mapping:
0 [0,]


@@ -42,6 +42,7 @@ pub fn upgrade_index_scheduler(
(1, 16, _) => 0,
(1, 17, _) => 0,
(1, 18, _) => 0,
(1, 22, _) => 0,
(major, minor, patch) => {
if major > current_major
|| (major == current_major && minor > current_minor)
@@ -91,6 +92,7 @@ pub fn upgrade_index_scheduler(
details: Some(Details::UpgradeDatabase { from, to }),
status: Status::Enqueued,
kind: KindWithContent::UpgradeDatabase { from },
network: None,
},
)?;
wtxn.commit()?;


@@ -1,6 +1,5 @@
//! Utility functions on the DBs. Mainly getter and setters.
use crate::milli::progress::EmbedderStats;
use std::collections::{BTreeSet, HashSet};
use std::ops::Bound;
use std::sync::Arc;
@@ -15,6 +14,7 @@ use meilisearch_types::tasks::{
use roaring::RoaringBitmap;
use time::OffsetDateTime;
use crate::milli::progress::EmbedderStats;
use crate::{Error, Result, Task, TaskId, BEI128};
/// This structure contains all the information required to write a batch in the database without reading the tasks.
@@ -377,6 +377,7 @@ impl crate::IndexScheduler {
details,
status,
kind,
network: _,
} = task;
assert_eq!(uid, task.uid);
if task.status != Status::Enqueued {


@@ -235,9 +235,11 @@ InvalidDocumentFields , InvalidRequest , BAD_REQU
InvalidDocumentRetrieveVectors , InvalidRequest , BAD_REQUEST ;
MissingDocumentFilter , InvalidRequest , BAD_REQUEST ;
MissingDocumentEditionFunction , InvalidRequest , BAD_REQUEST ;
InconsistentDocumentChangeHeaders , InvalidRequest , BAD_REQUEST ;
InvalidDocumentFilter , InvalidRequest , BAD_REQUEST ;
InvalidDocumentSort , InvalidRequest , BAD_REQUEST ;
InvalidDocumentGeoField , InvalidRequest , BAD_REQUEST ;
InvalidHeaderValue , InvalidRequest , BAD_REQUEST ;
InvalidVectorDimensions , InvalidRequest , BAD_REQUEST ;
InvalidVectorsType , InvalidRequest , BAD_REQUEST ;
InvalidDocumentId , InvalidRequest , BAD_REQUEST ;
@@ -266,7 +268,9 @@ InvalidMultiSearchRemote , InvalidRequest , BAD_REQU
InvalidMultiSearchWeight , InvalidRequest , BAD_REQUEST ;
InvalidNetworkRemotes , InvalidRequest , BAD_REQUEST ;
InvalidNetworkSelf , InvalidRequest , BAD_REQUEST ;
InvalidNetworkSharding , InvalidRequest , BAD_REQUEST ;
InvalidNetworkSearchApiKey , InvalidRequest , BAD_REQUEST ;
InvalidNetworkWriteApiKey , InvalidRequest , BAD_REQUEST ;
InvalidNetworkUrl , InvalidRequest , BAD_REQUEST ;
InvalidSearchAttributesToSearchOn , InvalidRequest , BAD_REQUEST ;
InvalidSearchAttributesToCrop , InvalidRequest , BAD_REQUEST ;


@@ -1,5 +1,3 @@
use std::collections::BTreeMap;
use serde::{Deserialize, Serialize};
use crate::error::{Code, ResponseError};
@@ -32,23 +30,6 @@ pub struct InstanceTogglableFeatures {
pub contains_filter: bool,
}
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)]
#[serde(rename_all = "camelCase")]
pub struct Remote {
pub url: String,
#[serde(default)]
pub search_api_key: Option<String>,
}
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq, Default)]
#[serde(rename_all = "camelCase")]
pub struct Network {
#[serde(default, rename = "self")]
pub local: Option<String>,
#[serde(default)]
pub remotes: BTreeMap<String, Remote>,
}
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq, Default)]
#[serde(rename_all = "camelCase")]
pub struct ChatCompletionSettings {


@@ -10,6 +10,7 @@ pub mod index_uid;
pub mod index_uid_pattern;
pub mod keys;
pub mod locales;
pub mod network;
pub mod settings;
pub mod star_or;
pub mod task_view;


@@ -0,0 +1,47 @@
// Copyright © 2025 Meilisearch Some Rights Reserved
// This file is part of Meilisearch Enterprise Edition (EE).
// Use of this source code is governed by the Business Source License 1.1,
// as found in the LICENSE-EE file or at <https://mariadb.com/bsl11>
use std::collections::BTreeMap;
use milli::update::new::indexer::sharding::Shards;
use serde::{Deserialize, Serialize};
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq, Default)]
#[serde(rename_all = "camelCase")]
pub struct Network {
#[serde(default, rename = "self")]
pub local: Option<String>,
#[serde(default)]
pub remotes: BTreeMap<String, Remote>,
#[serde(default)]
pub sharding: bool,
}
impl Network {
pub fn shards(&self) -> Option<Shards> {
if self.sharding {
let this = self.local.as_deref().expect("Inconsistent `sharding` and `self`");
let others = self
.remotes
.keys()
.filter(|name| name.as_str() != this)
.map(|name| name.to_owned())
.collect();
Some(Shards { own: vec![this.to_owned()], others })
} else {
None
}
}
}
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)]
#[serde(rename_all = "camelCase")]
pub struct Remote {
pub url: String,
#[serde(default)]
pub search_api_key: Option<String>,
#[serde(default)]
pub write_api_key: Option<String>,
}
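A usage sketch of the module above (node names and URLs invented): with `sharding` enabled, the local node ends up alone in `own` and every other remote in `others`; with `sharding: false`, `shards()` returns `None` and writes are not partitioned.
use std::collections::BTreeMap;

let network = Network {
    local: Some("ms-0".to_string()),
    remotes: BTreeMap::from([
        ("ms-0".to_string(), Remote { url: "http://ms-0:7700".into(), search_api_key: None, write_api_key: None }),
        ("ms-1".to_string(), Remote { url: "http://ms-1:7700".into(), search_api_key: None, write_api_key: None }),
    ]),
    sharding: true,
};
let shards = network.shards().expect("sharding is on and `self` is set");
assert_eq!(shards.own, vec!["ms-0".to_string()]);
assert_eq!(shards.others, vec!["ms-1".to_string()]);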


@@ -11,6 +11,7 @@ use crate::error::ResponseError;
use crate::settings::{Settings, Unchecked};
use crate::tasks::{
serialize_duration, Details, DetailsExportIndexSettings, IndexSwap, Kind, Status, Task, TaskId,
TaskNetwork,
};
#[derive(Debug, Clone, PartialEq, Serialize, ToSchema)]
@@ -51,6 +52,9 @@ pub struct TaskView {
#[schema(value_type = String, example = json!("2024-08-08_14:12:09.393Z"))]
#[serde(with = "time::serde::rfc3339::option", default)]
pub finished_at: Option<OffsetDateTime>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub network: Option<TaskNetwork>,
}
impl TaskView {
@@ -68,6 +72,7 @@ impl TaskView {
enqueued_at: task.enqueued_at,
started_at: task.started_at,
finished_at: task.finished_at,
network: task.network.clone(),
}
}
}


@@ -42,6 +42,9 @@ pub struct Task {
pub status: Status,
pub kind: KindWithContent,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub network: Option<TaskNetwork>,
}
impl Task {
@@ -737,6 +740,36 @@ pub enum Details {
},
}
#[derive(Debug, PartialEq, Clone, Serialize, Deserialize, ToSchema)]
#[serde(untagged, rename_all = "camelCase")]
pub enum TaskNetwork {
Origin { origin: Origin },
Remotes { remote_tasks: BTreeMap<String, RemoteTask> },
}
#[derive(Debug, PartialEq, Clone, Serialize, Deserialize, ToSchema)]
#[serde(rename_all = "camelCase")]
pub struct Origin {
pub remote_name: String,
pub task_uid: usize,
}
#[derive(Debug, PartialEq, Clone, Serialize, Deserialize, ToSchema)]
#[serde(rename_all = "camelCase")]
pub struct RemoteTask {
#[serde(skip_serializing_if = "Option::is_none")]
task_uid: Option<TaskId>,
error: Option<ResponseError>,
}
impl From<Result<TaskId, ResponseError>> for RemoteTask {
fn from(res: Result<TaskId, ResponseError>) -> RemoteTask {
match res {
Ok(task_uid) => RemoteTask { task_uid: Some(task_uid), error: None },
Err(err) => RemoteTask { task_uid: None, error: Some(err) },
}
}
}
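Because the enum is `untagged`, a task's `network` field serializes straight into the shape of whichever variant it holds; a hedged sketch for the `Origin` side (the `Remotes` side maps each remote name to its `taskUid`/`error` pair):
// On the receiving node, a proxied task records where it came from:
let network = TaskNetwork::Origin {
    origin: Origin { remote_name: "ms-0".to_string(), task_uid: 42 },
};
assert_eq!(
    serde_json::to_value(&network).unwrap(),
    serde_json::json!({ "origin": { "remoteName": "ms-0", "taskUid": 42 } })
);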
#[derive(Debug, PartialEq, Clone, Serialize, Deserialize, ToSchema)]
#[schema(rename_all = "camelCase")]
pub struct DetailsExportIndexSettings {


@@ -115,6 +115,9 @@ utoipa-scalar = { version = "0.3.0", optional = true, features = ["actix-web"] }
async-openai = { git = "https://github.com/meilisearch/async-openai", branch = "better-error-handling" }
secrecy = "0.10.3"
actix-web-lab = { version = "0.24.1", default-features = false }
urlencoding = "2.1.3"
backoff = { version = "0.4.0", features = ["tokio"] }
[dev-dependencies]
actix-rt = "2.10.0"
@@ -125,7 +128,6 @@ manifest-dir-macros = "0.1.18"
maplit = "1.0.2"
meili-snap = { path = "../meili-snap" }
temp-env = "0.3.6"
urlencoding = "2.1.3"
wiremock = "0.6.3"
yaup = "0.3.1"


@@ -9,6 +9,8 @@ use meilisearch_types::milli::OrderBy;
use serde_json::Value;
use tokio::task::JoinError;
use crate::routes::indexes::{PROXY_ORIGIN_REMOTE_HEADER, PROXY_ORIGIN_TASK_UID_HEADER};
#[derive(Debug, thiserror::Error)]
pub enum MeilisearchHttpError {
#[error("A Content-Type header is missing. Accepted values for the Content-Type header are: {}",
@@ -80,6 +82,16 @@ pub enum MeilisearchHttpError {
MissingSearchHybrid,
#[error("Invalid request: both `media` and `vector` parameters are present.")]
MediaAndVector,
#[error("Inconsistent `Origin` headers: {} was provided but {} is missing.\n - Hint: Either both headers should be provided, or none of them", if *is_remote_missing {
PROXY_ORIGIN_TASK_UID_HEADER
} else { PROXY_ORIGIN_REMOTE_HEADER },
if *is_remote_missing {
PROXY_ORIGIN_REMOTE_HEADER
} else { PROXY_ORIGIN_TASK_UID_HEADER }
)]
InconsistentOriginHeaders { is_remote_missing: bool },
#[error("Invalid value for header {header_name}: {msg}")]
InvalidHeaderValue { header_name: &'static str, msg: String },
}
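A small behavioral sketch of the new variant: the flag controls which of the two paired origin headers is reported missing (their literal values are defined in the proxy module, not shown in this hunk).
let err = MeilisearchHttpError::InconsistentOriginHeaders { is_remote_missing: true };
// task-uid header present, remote header absent; the hint names both headers
assert!(err.to_string().starts_with("Inconsistent `Origin` headers"));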
impl MeilisearchHttpError {
@@ -124,6 +136,10 @@ impl ErrorCode for MeilisearchHttpError {
MeilisearchHttpError::InconsistentFacetOrder { .. } => {
Code::InvalidMultiSearchFacetOrder
}
MeilisearchHttpError::InconsistentOriginHeaders { .. } => {
Code::InconsistentDocumentChangeHeaders
}
MeilisearchHttpError::InvalidHeaderValue { .. } => Code::InvalidHeaderValue,
}
}
}


@@ -628,6 +628,7 @@ fn import_dump(
&mut new_fields_ids_map,
&|| false, // never stop processing a dump
progress.clone(),
None,
)?;
let operation_stats = operation_stats.pop().unwrap();


@@ -45,6 +45,7 @@ use crate::extractors::authentication::policies::*;
use crate::extractors::authentication::GuardedData;
use crate::extractors::payload::Payload;
use crate::extractors::sequential_extractor::SeqHandler;
use crate::routes::indexes::proxy::{proxy, Body};
use crate::routes::indexes::search::fix_sort_query_parameters;
use crate::routes::{
get_task_id, is_dry_run, PaginationView, SummarizedTaskView, PAGINATION_DEFAULT_LIMIT,
@@ -338,6 +339,7 @@ pub async fn delete_document(
) -> Result<HttpResponse, ResponseError> {
let DocumentParam { index_uid, document_id } = path.into_inner();
let index_uid = IndexUid::try_from(index_uid)?;
let network = index_scheduler.network();
analytics.publish(
DocumentsDeletionAggregator {
@@ -355,10 +357,16 @@ pub async fn delete_document(
};
let uid = get_task_id(&req, &opt)?;
let dry_run = is_dry_run(&req, &opt)?;
let task: SummarizedTaskView =
tokio::task::spawn_blocking(move || index_scheduler.register(task, uid, dry_run))
.await??
.into();
let task = {
let index_scheduler = index_scheduler.clone();
tokio::task::spawn_blocking(move || index_scheduler.register(task, uid, dry_run)).await??
};
if network.sharding && !dry_run {
proxy(&index_scheduler, &index_uid, &req, network, Body::none(), &task).await?;
}
let task: SummarizedTaskView = task.into();
debug!("returns: {:?}", task);
Ok(HttpResponse::Accepted().json(task))
}
@@ -804,7 +812,6 @@ pub async fn replace_documents(
let uid = get_task_id(&req, &opt)?;
let dry_run = is_dry_run(&req, &opt)?;
let task = document_addition(
extract_mime_type(&req)?,
index_scheduler,
index_uid,
params.primary_key,
@@ -814,8 +821,10 @@ pub async fn replace_documents(
uid,
dry_run,
allow_index_creation,
&req,
)
.await?;
debug!(returns = ?task, "Replace documents");
Ok(HttpResponse::Accepted().json(task))
@@ -905,7 +914,6 @@ pub async fn update_documents(
let uid = get_task_id(&req, &opt)?;
let dry_run = is_dry_run(&req, &opt)?;
let task = document_addition(
extract_mime_type(&req)?,
index_scheduler,
index_uid,
params.primary_key,
@@ -915,6 +923,7 @@ pub async fn update_documents(
uid,
dry_run,
allow_index_creation,
&req,
)
.await?;
debug!(returns = ?task, "Update documents");
@@ -924,7 +933,6 @@ pub async fn update_documents(
#[allow(clippy::too_many_arguments)]
async fn document_addition(
mime_type: Option<Mime>,
index_scheduler: GuardedData<ActionPolicy<{ actions::DOCUMENTS_ADD }>, Data<IndexScheduler>>,
index_uid: IndexUid,
primary_key: Option<String>,
@@ -934,7 +942,11 @@ async fn document_addition(
task_id: Option<TaskId>,
dry_run: bool,
allow_index_creation: bool,
req: &HttpRequest,
) -> Result<SummarizedTaskView, MeilisearchHttpError> {
let mime_type = extract_mime_type(req)?;
let network = index_scheduler.network();
let format = match (
mime_type.as_ref().map(|m| (m.type_().as_str(), m.subtype().as_str())),
csv_delimiter,
@@ -966,7 +978,7 @@ async fn document_addition(
};
let (uuid, mut update_file) = index_scheduler.queue.create_update_file(dry_run)?;
let documents_count = match format {
let res = match format {
PayloadType::Ndjson => {
let (path, file) = update_file.into_parts();
let file = match file {
@@ -981,19 +993,19 @@ async fn document_addition(
None => None,
};
let documents_count = tokio::task::spawn_blocking(move || {
let res = tokio::task::spawn_blocking(move || {
let documents_count = file.as_ref().map_or(Ok(0), |ntf| {
read_ndjson(ntf.as_file()).map_err(MeilisearchHttpError::DocumentFormat)
})?;
let update_file = file_store::File::from_parts(path, file);
update_file.persist()?;
let update_file = update_file.persist()?;
Ok(documents_count)
Ok((documents_count, update_file))
})
.await?;
Ok(documents_count)
Ok(res)
}
PayloadType::Json | PayloadType::Csv { delimiter: _ } => {
let temp_file = match tempfile() {
@@ -1012,16 +1024,16 @@ async fn document_addition(
unreachable!("We already wrote the user content into the update file")
}
};
// we NEED to persist the file here because we moved the `udpate_file` in another task.
update_file.persist()?;
Ok(documents_count)
// we NEED to persist the file here because we moved the `update_file` in another task.
let file = update_file.persist()?;
Ok((documents_count, file))
})
.await
}
};
let documents_count = match documents_count {
Ok(Ok(documents_count)) => documents_count,
let (documents_count, file) = match res {
Ok(Ok((documents_count, file))) => (documents_count, file),
// in this case the file has not possibly be persisted.
Ok(Err(e)) => return Err(e),
Err(e) => {
@@ -1063,6 +1075,20 @@ async fn document_addition(
}
};
if network.sharding {
if let Some(file) = file {
proxy(
&index_scheduler,
&index_uid,
req,
network,
Body::with_ndjson_payload(file),
&task,
)
.await?;
}
}
Ok(task.into())
}
@@ -1141,6 +1167,7 @@ pub async fn delete_documents_batch(
) -> Result<HttpResponse, ResponseError> {
debug!(parameters = ?body, "Delete documents by batch");
let index_uid = IndexUid::try_from(index_uid.into_inner())?;
let network = index_scheduler.network();
analytics.publish(
DocumentsDeletionAggregator {
@@ -1161,16 +1188,22 @@ pub async fn delete_documents_batch(
KindWithContent::DocumentDeletion { index_uid: index_uid.to_string(), documents_ids: ids };
let uid = get_task_id(&req, &opt)?;
let dry_run = is_dry_run(&req, &opt)?;
let task: SummarizedTaskView =
tokio::task::spawn_blocking(move || index_scheduler.register(task, uid, dry_run))
.await??
.into();
let task = {
let index_scheduler = index_scheduler.clone();
tokio::task::spawn_blocking(move || index_scheduler.register(task, uid, dry_run)).await??
};
if network.sharding && !dry_run {
proxy(&index_scheduler, &index_uid, &req, network, Body::Inline(body), &task).await?;
}
let task: SummarizedTaskView = task.into();
debug!(returns = ?task, "Delete documents by batch");
Ok(HttpResponse::Accepted().json(task))
}
#[derive(Debug, Deserr, ToSchema)]
#[derive(Debug, Deserr, ToSchema, Serialize)]
#[deserr(error = DeserrJsonError, rename_all = camelCase, deny_unknown_fields)]
#[schema(rename_all = "camelCase")]
pub struct DocumentDeletionByFilter {
@@ -1219,7 +1252,8 @@ pub async fn delete_documents_by_filter(
debug!(parameters = ?body, "Delete documents by filter");
let index_uid = IndexUid::try_from(index_uid.into_inner())?;
let index_uid = index_uid.into_inner();
let filter = body.into_inner().filter;
let filter = body.into_inner();
let network = index_scheduler.network();
analytics.publish(
DocumentsDeletionAggregator {
@@ -1232,23 +1266,36 @@ pub async fn delete_documents_by_filter(
);
// we ensure the filter is well formed before enqueuing it
crate::search::parse_filter(&filter, Code::InvalidDocumentFilter, index_scheduler.features())?
.ok_or(MeilisearchHttpError::EmptyFilter)?;
crate::search::parse_filter(
&filter.filter,
Code::InvalidDocumentFilter,
index_scheduler.features(),
)?
.ok_or(MeilisearchHttpError::EmptyFilter)?;
let task = KindWithContent::DocumentDeletionByFilter { index_uid, filter_expr: filter };
let task = KindWithContent::DocumentDeletionByFilter {
index_uid: index_uid.clone(),
filter_expr: filter.filter.clone(),
};
let uid = get_task_id(&req, &opt)?;
let dry_run = is_dry_run(&req, &opt)?;
let task: SummarizedTaskView =
tokio::task::spawn_blocking(move || index_scheduler.register(task, uid, dry_run))
.await??
.into();
let task = {
let index_scheduler = index_scheduler.clone();
tokio::task::spawn_blocking(move || index_scheduler.register(task, uid, dry_run)).await??
};
if network.sharding && !dry_run {
proxy(&index_scheduler, &index_uid, &req, network, Body::Inline(filter), &task).await?;
}
let task: SummarizedTaskView = task.into();
debug!(returns = ?task, "Delete documents by filter");
Ok(HttpResponse::Accepted().json(task))
}
#[derive(Debug, Deserr, ToSchema)]
#[derive(Debug, Deserr, ToSchema, Serialize)]
#[deserr(error = DeserrJsonError, rename_all = camelCase, deny_unknown_fields)]
pub struct DocumentEditionByFunction {
/// A string containing a RHAI function.
@@ -1336,6 +1383,8 @@ pub async fn edit_documents_by_function(
.features()
.check_edit_documents_by_function("Using the documents edit route")?;
let network = index_scheduler.network();
let index_uid = IndexUid::try_from(index_uid.into_inner())?;
let index_uid = index_uid.into_inner();
let params = params.into_inner();
@@ -1349,13 +1398,12 @@ pub async fn edit_documents_by_function(
&req,
);
let DocumentEditionByFunction { filter, context, function } = params;
let engine = milli::rhai::Engine::new();
if let Err(e) = engine.compile(&function) {
if let Err(e) = engine.compile(&params.function) {
return Err(ResponseError::from_msg(e.to_string(), Code::BadRequest));
}
if let Some(ref filter) = filter {
if let Some(ref filter) = params.filter {
// we ensure the filter is well formed before enqueuing it
crate::search::parse_filter(
filter,
@@ -1365,9 +1413,9 @@ pub async fn edit_documents_by_function(
.ok_or(MeilisearchHttpError::EmptyFilter)?;
}
let task = KindWithContent::DocumentEdition {
index_uid,
filter_expr: filter,
context: match context {
index_uid: index_uid.clone(),
filter_expr: params.filter.clone(),
context: match params.context.clone() {
Some(Value::Object(m)) => Some(m),
None => None,
_ => {
@@ -1377,15 +1425,21 @@ pub async fn edit_documents_by_function(
))
}
},
function,
function: params.function.clone(),
};
let uid = get_task_id(&req, &opt)?;
let dry_run = is_dry_run(&req, &opt)?;
let task: SummarizedTaskView =
tokio::task::spawn_blocking(move || index_scheduler.register(task, uid, dry_run))
.await??
.into();
let task = {
let index_scheduler = index_scheduler.clone();
tokio::task::spawn_blocking(move || index_scheduler.register(task, uid, dry_run)).await??
};
if network.sharding && !dry_run {
proxy(&index_scheduler, &index_uid, &req, network, Body::Inline(params), &task).await?;
}
let task: SummarizedTaskView = task.into();
debug!(returns = ?task, "Edit documents by function");
Ok(HttpResponse::Accepted().json(task))
@@ -1428,6 +1482,8 @@ pub async fn clear_all_documents(
analytics: web::Data<Analytics>,
) -> Result<HttpResponse, ResponseError> {
let index_uid = IndexUid::try_from(index_uid.into_inner())?;
let network = index_scheduler.network();
analytics.publish(
DocumentsDeletionAggregator {
clear_all: true,
@@ -1441,10 +1497,18 @@ pub async fn clear_all_documents(
let task = KindWithContent::DocumentClear { index_uid: index_uid.to_string() };
let uid = get_task_id(&req, &opt)?;
let dry_run = is_dry_run(&req, &opt)?;
let task: SummarizedTaskView =
tokio::task::spawn_blocking(move || index_scheduler.register(task, uid, dry_run))
.await??
.into();
let task = {
let index_scheduler = index_scheduler.clone();
tokio::task::spawn_blocking(move || index_scheduler.register(task, uid, dry_run)).await??
};
if network.sharding && !dry_run {
proxy(&index_scheduler, &index_uid, &req, network, Body::none(), &task).await?;
}
let task: SummarizedTaskView = task.into();
debug!(returns = ?task, "Delete all documents");
Ok(HttpResponse::Accepted().json(task))


@@ -30,6 +30,7 @@ use crate::Opt;
pub mod documents;
pub mod facet_search;
mod proxy;
pub mod search;
mod search_analytics;
#[cfg(test)]
@@ -39,6 +40,8 @@ mod settings_analytics;
pub mod similar;
mod similar_analytics;
pub use proxy::{PROXY_ORIGIN_REMOTE_HEADER, PROXY_ORIGIN_TASK_UID_HEADER};
#[derive(OpenApi)]
#[openapi(
nest(


@@ -0,0 +1,424 @@
// Copyright © 2025 Meilisearch Some Rights Reserved
// This file is part of Meilisearch Enterprise Edition (EE).
// Use of this source code is governed by the Business Source License 1.1,
// as found in the LICENSE-EE file or at <https://mariadb.com/bsl11>
use std::collections::BTreeMap;
use std::fs::File;
use actix_web::http::header::CONTENT_TYPE;
use actix_web::HttpRequest;
use bytes::Bytes;
use index_scheduler::IndexScheduler;
use meilisearch_types::error::ResponseError;
use meilisearch_types::tasks::{Origin, RemoteTask, TaskNetwork};
use reqwest::StatusCode;
use serde::de::DeserializeOwned;
use serde_json::Value;
use crate::error::MeilisearchHttpError;
use crate::routes::indexes::proxy::error::{ProxyDocumentChangeError, ReqwestErrorWithoutUrl};
use crate::routes::SummarizedTaskView;
pub enum Body<T: serde::Serialize> {
NdJsonPayload(File),
Inline(T),
None,
}
impl Body<()> {
pub fn with_ndjson_payload(file: File) -> Self {
Self::NdJsonPayload(file)
}
pub fn none() -> Self {
Self::None
}
}
/// If necessary, proxies the passed request to the network and updates the task description.
///
/// This function reads the custom headers from the request to determine whether it must proxy the request or whether the
/// request has already been proxied.
///
/// - when it must proxy the request, the endpoint, method and query params are retrieved from the passed `req`, then the `body` is
/// sent to all remotes of the `network` (except `self`). The responses from the remotes are collected to update the passed `task`
/// with the task ids from the task queues of the remotes.
/// - when the request has already been proxied, the custom headers contain information about the remote that created the initial task.
/// This information is copied to the passed task.
pub async fn proxy<T: serde::Serialize>(
index_scheduler: &IndexScheduler,
index_uid: &str,
req: &HttpRequest,
network: meilisearch_types::network::Network,
body: Body<T>,
task: &meilisearch_types::tasks::Task,
) -> Result<(), MeilisearchHttpError> {
match origin_from_req(req)? {
Some(origin) => {
index_scheduler.set_task_network(task.uid, TaskNetwork::Origin { origin })?
}
None => {
let this = network
.local
.as_deref()
.expect("inconsistent `network.sharding` and `network.self`")
.to_owned();
let content_type = match &body {
// for file bodies, force x-ndjson
Body::NdJsonPayload(_) => Some(b"application/x-ndjson".as_slice()),
// otherwise get content type from request
_ => req.headers().get(CONTENT_TYPE).map(|h| h.as_bytes()),
};
let body = match body {
Body::NdJsonPayload(file) => Some(Bytes::from_owner(unsafe {
memmap2::Mmap::map(&file).map_err(|err| {
MeilisearchHttpError::from_milli(err.into(), Some(index_uid.to_owned()))
})?
})),
Body::Inline(payload) => {
Some(Bytes::copy_from_slice(&serde_json::to_vec(&payload).unwrap()))
}
Body::None => None,
};
let mut in_flight_remote_queries = BTreeMap::new();
let client = reqwest::ClientBuilder::new()
.connect_timeout(std::time::Duration::from_secs(3))
.build()
.unwrap();
let method = from_old_http_method(req.method());
// send payload to all remotes
for (node_name, node) in
network.remotes.into_iter().filter(|(name, _)| name.as_str() != this)
{
let body = body.clone();
let client = client.clone();
let api_key = node.write_api_key;
let this = this.clone();
let method = method.clone();
let path_and_query =
req.uri().path_and_query().map(|paq| paq.as_str()).unwrap_or("/");
in_flight_remote_queries.insert(
node_name,
tokio::spawn({
let url = format!("{}{}", node.url, path_and_query);
let url_encoded_this = urlencoding::encode(&this).into_owned();
let url_encoded_task_uid = task.uid.to_string(); // the task uid is numeric, so it is already URL-safe
let content_type = content_type.map(|b| b.to_owned());
let backoff = backoff::ExponentialBackoffBuilder::new()
.with_max_elapsed_time(Some(std::time::Duration::from_secs(25)))
.build();
backoff::future::retry(backoff, move || {
let url = url.clone();
let client = client.clone();
let url_encoded_this = url_encoded_this.clone();
let url_encoded_task_uid = url_encoded_task_uid.clone();
let content_type = content_type.clone();
let body = body.clone();
let api_key = api_key.clone();
let method = method.clone();
async move {
try_proxy(
method,
&url,
content_type.as_deref(),
api_key.as_deref(),
&client,
&url_encoded_this,
&url_encoded_task_uid,
body,
)
.await
}
})
}),
);
}
// wait for all in-flight queries to finish and collect their results
let mut remote_tasks: BTreeMap<String, RemoteTask> = BTreeMap::new();
for (node_name, handle) in in_flight_remote_queries {
match handle.await {
Ok(Ok(res)) => {
let task_uid = res.task_uid;
remote_tasks.insert(node_name, Ok(task_uid).into());
}
Ok(Err(error)) => {
remote_tasks.insert(node_name, Err(error.as_response_error()).into());
}
Err(panic) => match panic.try_into_panic() {
Ok(panic) => {
let msg = match panic.downcast_ref::<&'static str>() {
Some(s) => *s,
None => match panic.downcast_ref::<String>() {
Some(s) => &s[..],
None => "Box<dyn Any>",
},
};
remote_tasks.insert(
node_name,
Err(ResponseError::from_msg(
msg.to_string(),
meilisearch_types::error::Code::Internal,
))
.into(),
);
}
Err(_) => {
tracing::error!("proxy task was unexpectedly cancelled")
}
},
}
}
// edit details to contain the return values from the remotes
index_scheduler.set_task_network(task.uid, TaskNetwork::Remotes { remote_tasks })?;
}
}
Ok(())
}
fn from_old_http_method(method: &actix_http::Method) -> reqwest::Method {
match method {
&actix_http::Method::CONNECT => reqwest::Method::CONNECT,
&actix_http::Method::DELETE => reqwest::Method::DELETE,
&actix_http::Method::GET => reqwest::Method::GET,
&actix_http::Method::HEAD => reqwest::Method::HEAD,
&actix_http::Method::OPTIONS => reqwest::Method::OPTIONS,
&actix_http::Method::PATCH => reqwest::Method::PATCH,
&actix_http::Method::POST => reqwest::Method::POST,
&actix_http::Method::PUT => reqwest::Method::PUT,
&actix_http::Method::TRACE => reqwest::Method::TRACE,
method => reqwest::Method::from_bytes(method.as_str().as_bytes()).unwrap(),
}
}
#[allow(clippy::too_many_arguments)]
async fn try_proxy(
method: reqwest::Method,
url: &str,
content_type: Option<&[u8]>,
api_key: Option<&str>,
client: &reqwest::Client,
url_encoded_this: &str,
url_encoded_task_uid: &str,
body: Option<Bytes>,
) -> Result<SummarizedTaskView, backoff::Error<ProxyDocumentChangeError>> {
let request = client.request(method, url).timeout(std::time::Duration::from_secs(30));
let request = if let Some(body) = body { request.body(body) } else { request };
let request = if let Some(api_key) = api_key { request.bearer_auth(api_key) } else { request };
let request = request.header(PROXY_ORIGIN_TASK_UID_HEADER, url_encoded_task_uid);
let request = request.header(PROXY_ORIGIN_REMOTE_HEADER, url_encoded_this);
let request = if let Some(content_type) = content_type {
request.header(CONTENT_TYPE.as_str(), content_type)
} else {
request
};
let response = request.send().await;
let response = match response {
Ok(response) => response,
Err(error) if error.is_timeout() => {
return Err(backoff::Error::transient(ProxyDocumentChangeError::Timeout))
}
Err(error) => {
return Err(backoff::Error::transient(ProxyDocumentChangeError::CouldNotSendRequest(
ReqwestErrorWithoutUrl::new(error),
)))
}
};
match response.status() {
status_code if status_code.is_success() => (),
StatusCode::UNAUTHORIZED | StatusCode::FORBIDDEN => {
return Err(backoff::Error::Permanent(ProxyDocumentChangeError::AuthenticationError))
}
status_code if status_code.is_client_error() => {
let response = parse_error(response).await;
return Err(backoff::Error::Permanent(ProxyDocumentChangeError::BadRequest {
status_code,
response,
}));
}
status_code if status_code.is_server_error() => {
let response = parse_error(response).await;
return Err(backoff::Error::transient(ProxyDocumentChangeError::RemoteError {
status_code,
response,
}));
}
status_code => {
tracing::warn!(
status_code = status_code.as_u16(),
"remote replied with unexpected status code"
);
}
}
let response = match parse_response(response).await {
Ok(response) => response,
Err(response) => {
return Err(backoff::Error::transient(
ProxyDocumentChangeError::CouldNotParseResponse { response },
))
}
};
Ok(response)
}
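// For reference, the retry policy that falls out of the match above (a summary of the
// existing code, not new behaviour). Transient errors are retried under the exponential
// backoff configured in `proxy` (roughly 25 seconds of total budget):
//   - request timeout                -> ProxyDocumentChangeError::Timeout
//   - request could not be sent      -> ProxyDocumentChangeError::CouldNotSendRequest
//   - 5xx from the remote            -> ProxyDocumentChangeError::RemoteError
//   - unparsable success body        -> ProxyDocumentChangeError::CouldNotParseResponse
// Permanent errors abort immediately:
//   - 401 / 403                      -> ProxyDocumentChangeError::AuthenticationError
//   - any other 4xx                  -> ProxyDocumentChangeError::BadRequest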
async fn parse_error(response: reqwest::Response) -> Result<String, ReqwestErrorWithoutUrl> {
let bytes = match response.bytes().await {
Ok(bytes) => bytes,
Err(error) => return Err(ReqwestErrorWithoutUrl::new(error)),
};
Ok(parse_bytes_as_error(&bytes))
}
fn parse_bytes_as_error(bytes: &[u8]) -> String {
match serde_json::from_slice::<Value>(bytes) {
Ok(value) => value.to_string(),
Err(_) => String::from_utf8_lossy(bytes).into_owned(),
}
}
async fn parse_response<T: DeserializeOwned>(
response: reqwest::Response,
) -> Result<T, Result<String, ReqwestErrorWithoutUrl>> {
let bytes = match response.bytes().await {
Ok(bytes) => bytes,
Err(error) => return Err(Err(ReqwestErrorWithoutUrl::new(error))),
};
match serde_json::from_slice::<T>(&bytes) {
Ok(value) => Ok(value),
Err(_) => Err(Ok(parse_bytes_as_error(&bytes))),
}
}
mod error {
use meilisearch_types::error::ResponseError;
use reqwest::StatusCode;
#[derive(Debug, thiserror::Error)]
pub enum ProxyDocumentChangeError {
#[error("{0}")]
CouldNotSendRequest(ReqwestErrorWithoutUrl),
#[error("could not authenticate against the remote host\n - hint: check that the remote instance was registered with a valid API key having the `documents.add` action")]
AuthenticationError,
#[error(
"could not parse response from the remote host as a document addition response{}\n - hint: check that the remote instance is a Meilisearch instance running the same version",
response_from_remote(response)
)]
CouldNotParseResponse { response: Result<String, ReqwestErrorWithoutUrl> },
#[error("remote host responded with code {}{}\n - hint: check that the remote instance has the correct index configuration for that request\n - hint: check that the `network` experimental feature is enabled on the remote instance", status_code.as_u16(), response_from_remote(response))]
BadRequest { status_code: StatusCode, response: Result<String, ReqwestErrorWithoutUrl> },
#[error("remote host did not answer before the deadline")]
Timeout,
#[error("remote host responded with code {}{}", status_code.as_u16(), response_from_remote(response))]
RemoteError { status_code: StatusCode, response: Result<String, ReqwestErrorWithoutUrl> },
}
impl ProxyDocumentChangeError {
pub fn as_response_error(&self) -> ResponseError {
use meilisearch_types::error::Code;
let message = self.to_string();
let code = match self {
ProxyDocumentChangeError::CouldNotSendRequest(_) => Code::RemoteCouldNotSendRequest,
ProxyDocumentChangeError::AuthenticationError => Code::RemoteInvalidApiKey,
ProxyDocumentChangeError::BadRequest { .. } => Code::RemoteBadRequest,
ProxyDocumentChangeError::Timeout => Code::RemoteTimeout,
ProxyDocumentChangeError::RemoteError { .. } => Code::RemoteRemoteError,
ProxyDocumentChangeError::CouldNotParseResponse { .. } => Code::RemoteBadResponse,
};
ResponseError::from_msg(message, code)
}
}
#[derive(Debug, thiserror::Error)]
#[error(transparent)]
pub struct ReqwestErrorWithoutUrl(reqwest::Error);
impl ReqwestErrorWithoutUrl {
pub fn new(inner: reqwest::Error) -> Self {
Self(inner.without_url())
}
}
fn response_from_remote(response: &Result<String, ReqwestErrorWithoutUrl>) -> String {
match response {
Ok(response) => {
format!(":\n - response from remote: {}", response)
}
Err(error) => {
format!(":\n - additionally, could not retrieve response from remote: {error}")
}
}
}
}
pub const PROXY_ORIGIN_REMOTE_HEADER: &str = "Meili-Proxy-Origin-Remote";
pub const PROXY_ORIGIN_TASK_UID_HEADER: &str = "Meili-Proxy-Origin-TaskUid";
pub fn origin_from_req(req: &HttpRequest) -> Result<Option<Origin>, MeilisearchHttpError> {
let (remote_name, task_uid) = match (
req.headers().get(PROXY_ORIGIN_REMOTE_HEADER),
req.headers().get(PROXY_ORIGIN_TASK_UID_HEADER),
) {
(None, None) => return Ok(None),
(None, Some(_)) => {
return Err(MeilisearchHttpError::InconsistentOriginHeaders { is_remote_missing: true })
}
(Some(_), None) => {
return Err(MeilisearchHttpError::InconsistentOriginHeaders {
is_remote_missing: false,
})
}
(Some(remote_name), Some(task_uid)) => (
urlencoding::decode(remote_name.to_str().map_err(|err| {
MeilisearchHttpError::InvalidHeaderValue {
header_name: PROXY_ORIGIN_REMOTE_HEADER,
msg: format!("while parsing remote name as UTF-8: {err}"),
}
})?)
.map_err(|err| MeilisearchHttpError::InvalidHeaderValue {
header_name: PROXY_ORIGIN_REMOTE_HEADER,
msg: format!("while URL-decoding remote name: {err}"),
})?,
urlencoding::decode(task_uid.to_str().map_err(|err| {
MeilisearchHttpError::InvalidHeaderValue {
header_name: PROXY_ORIGIN_TASK_UID_HEADER,
msg: format!("while parsing task UID as UTF-8: {err}"),
}
})?)
.map_err(|err| MeilisearchHttpError::InvalidHeaderValue {
header_name: PROXY_ORIGIN_TASK_UID_HEADER,
msg: format!("while URL-decoding task UID: {err}"),
})?,
),
};
let task_uid: usize =
task_uid.parse().map_err(|err| MeilisearchHttpError::InvalidHeaderValue {
header_name: PROXY_ORIGIN_TASK_UID_HEADER,
msg: format!("while parsing the task UID as an integer: {err}"),
})?;
Ok(Some(Origin { remote_name: remote_name.into_owned(), task_uid }))
}
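A minimal sketch of how the two origin headers round-trip between instances, using actix-web's `TestRequest` purely for illustration; the header constants and `origin_from_req` are the ones defined above:
use actix_web::test::TestRequest;
let req = TestRequest::default()
    .insert_header((PROXY_ORIGIN_REMOTE_HEADER, urlencoding::encode("ms-0").into_owned()))
    .insert_header((PROXY_ORIGIN_TASK_UID_HEADER, "42"))
    .to_http_request();
// A proxied instance finds both headers, records the origin on its own task,
// and does not forward the request a second time.
let origin = origin_from_req(&req).unwrap().unwrap();
assert_eq!(origin.remote_name, "ms-0");
assert_eq!(origin.task_uid, 42);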

View File

@@ -184,7 +184,7 @@ pub fn is_dry_run(req: &HttpRequest, opt: &Opt) -> Result<bool, ResponseError> {
.is_some_and(|s| s.to_lowercase() == "true"))
}
#[derive(Debug, Serialize, ToSchema)]
#[derive(Debug, Serialize, Deserialize, ToSchema)]
#[serde(rename_all = "camelCase")]
pub struct SummarizedTaskView {
/// The task unique identifier.
@@ -198,7 +198,10 @@ pub struct SummarizedTaskView {
#[serde(rename = "type")]
kind: Kind,
/// The date on which the task was enqueued.
#[serde(serialize_with = "time::serde::rfc3339::serialize")]
#[serde(
serialize_with = "time::serde::rfc3339::serialize",
deserialize_with = "time::serde::rfc3339::deserialize"
)]
enqueued_at: OffsetDateTime,
}
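`SummarizedTaskView` now needs `Deserialize` because the proxy parses the remote's `202 Accepted` body back into it (`try_proxy` in `proxy.rs` returns it via `parse_response`). A sketch with an illustrative payload in the public task-summary format:
// Illustrative payload; field values are made up, the shape mirrors the public API.
let body = r#"{
    "taskUid": 42,
    "indexUid": "movies",
    "status": "enqueued",
    "type": "documentAdditionOrUpdate",
    "enqueuedAt": "2025-08-18T16:23:05Z"
}"#;
let remote_task: SummarizedTaskView = serde_json::from_str(body).expect("valid task summary");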

View File

@@ -8,12 +8,13 @@ use index_scheduler::IndexScheduler;
use itertools::{EitherOrBoth, Itertools};
use meilisearch_types::deserr::DeserrJsonError;
use meilisearch_types::error::deserr_codes::{
InvalidNetworkRemotes, InvalidNetworkSearchApiKey, InvalidNetworkSelf, InvalidNetworkUrl,
InvalidNetworkRemotes, InvalidNetworkSearchApiKey, InvalidNetworkSelf, InvalidNetworkSharding,
InvalidNetworkUrl, InvalidNetworkWriteApiKey,
};
use meilisearch_types::error::ResponseError;
use meilisearch_types::features::{Network as DbNetwork, Remote as DbRemote};
use meilisearch_types::keys::actions;
use meilisearch_types::milli::update::Setting;
use meilisearch_types::network::{Network as DbNetwork, Remote as DbRemote};
use serde::Serialize;
use tracing::debug;
use utoipa::{OpenApi, ToSchema};
@@ -57,9 +58,9 @@ pub fn configure(cfg: &mut web::ServiceConfig) {
{
"self": "ms-0",
"remotes": {
"ms-0": Remote { url: Setting::Set("http://localhost:7700".into()), search_api_key: Setting::Reset },
"ms-1": Remote { url: Setting::Set("http://localhost:7701".into()), search_api_key: Setting::Set("foo".into()) },
"ms-2": Remote { url: Setting::Set("http://localhost:7702".into()), search_api_key: Setting::Set("bar".into()) },
"ms-0": Remote { url: Setting::Set("http://localhost:7700".into()), search_api_key: Setting::Reset, write_api_key: Setting::Reset },
"ms-1": Remote { url: Setting::Set("http://localhost:7701".into()), search_api_key: Setting::Set("foo".into()), write_api_key: Setting::Set("bar".into()) },
"ms-2": Remote { url: Setting::Set("http://localhost:7702".into()), search_api_key: Setting::Set("bar".into()), write_api_key: Setting::Set("foo".into()) },
}
})),
(status = 401, description = "The authorization header is missing", body = ResponseError, content_type = "application/json", example = json!(
@@ -88,9 +89,9 @@ async fn get_network(
#[schema(rename_all = "camelCase")]
pub struct Remote {
#[schema(value_type = Option<String>, example = json!({
"ms-0": Remote { url: Setting::Set("http://localhost:7700".into()), search_api_key: Setting::Reset },
"ms-1": Remote { url: Setting::Set("http://localhost:7701".into()), search_api_key: Setting::Set("foo".into()) },
"ms-2": Remote { url: Setting::Set("http://localhost:7702".into()), search_api_key: Setting::Set("bar".into()) },
"ms-0": Remote { url: Setting::Set("http://localhost:7700".into()), search_api_key: Setting::Reset, write_api_key: Setting::Reset },
"ms-1": Remote { url: Setting::Set("http://localhost:7701".into()), search_api_key: Setting::Set("foo".into()), write_api_key: Setting::Set("bar".into()) },
"ms-2": Remote { url: Setting::Set("http://localhost:7702".into()), search_api_key: Setting::Set("bar".into()), write_api_key: Setting::Set("foo".into()) },
}))]
#[deserr(default, error = DeserrJsonError<InvalidNetworkUrl>)]
#[serde(default)]
@@ -99,6 +100,10 @@ pub struct Remote {
#[deserr(default, error = DeserrJsonError<InvalidNetworkSearchApiKey>)]
#[serde(default)]
pub search_api_key: Setting<String>,
#[schema(value_type = Option<String>, example = json!("XWnBI8QHUc-4IlqbKPLUDuhftNq19mQtjc6JvmivzJU"))]
#[deserr(default, error = DeserrJsonError<InvalidNetworkWriteApiKey>)]
#[serde(default)]
pub write_api_key: Setting<String>,
}
#[derive(Debug, Deserr, ToSchema, Serialize)]
@@ -114,6 +119,10 @@ pub struct Network {
#[serde(default, rename = "self")]
#[deserr(default, rename = "self", error = DeserrJsonError<InvalidNetworkSelf>)]
pub local: Setting<String>,
#[schema(value_type = Option<bool>, example = json!(true))]
#[serde(default)]
#[deserr(default, error = DeserrJsonError<InvalidNetworkSharding>)]
pub sharding: Setting<bool>,
}
impl Remote {
@@ -136,6 +145,7 @@ impl Remote {
Ok(url)
})?,
search_api_key: self.search_api_key.set(),
write_api_key: self.write_api_key.set(),
})
}
}
@@ -174,9 +184,9 @@ impl Aggregate for PatchNetworkAnalytics {
{
"self": "ms-0",
"remotes": {
"ms-0": Remote { url: Setting::Set("http://localhost:7700".into()), search_api_key: Setting::Reset },
"ms-1": Remote { url: Setting::Set("http://localhost:7701".into()), search_api_key: Setting::Set("foo".into()) },
"ms-2": Remote { url: Setting::Set("http://localhost:7702".into()), search_api_key: Setting::Set("bar".into()) },
"ms-0": Remote { url: Setting::Set("http://localhost:7700".into()), search_api_key: Setting::Reset, write_api_key: Setting::Reset },
"ms-1": Remote { url: Setting::Set("http://localhost:7701".into()), search_api_key: Setting::Set("foo".into()), write_api_key: Setting::Set("bar".into()) },
"ms-2": Remote { url: Setting::Set("http://localhost:7702".into()), search_api_key: Setting::Set("bar".into()), write_api_key: Setting::Set("foo".into()) },
}
})),
(status = 401, description = "The authorization header is missing", body = ResponseError, content_type = "application/json", example = json!(
@@ -207,6 +217,19 @@ async fn patch_network(
Setting::NotSet => old_network.local,
};
let merged_sharding = match new_network.sharding {
Setting::Set(new_sharding) => new_sharding,
Setting::Reset => false,
Setting::NotSet => old_network.sharding,
};
if merged_sharding && merged_self.is_none() {
return Err(ResponseError::from_msg(
"`.sharding`: enabling the sharding requires `.self` to be set\n - Hint: Disable `sharding` or set `self` to a value.".into(),
meilisearch_types::error::Code::InvalidNetworkSharding,
));
}
let merged_remotes = match new_network.remotes {
Setting::Set(new_remotes) => {
let mut merged_remotes = BTreeMap::new();
@@ -217,9 +240,17 @@ async fn patch_network(
{
match either_or_both {
EitherOrBoth::Both((key, old), (_, Some(new))) => {
let DbRemote { url: old_url, search_api_key: old_search_api_key } = old;
let DbRemote {
url: old_url,
search_api_key: old_search_api_key,
write_api_key: old_write_api_key,
} = old;
let Remote { url: new_url, search_api_key: new_search_api_key } = new;
let Remote {
url: new_url,
search_api_key: new_search_api_key,
write_api_key: new_write_api_key,
} = new;
let merged = DbRemote {
url: match new_url {
@@ -247,6 +278,11 @@ async fn patch_network(
Setting::Reset => None,
Setting::NotSet => old_search_api_key,
},
write_api_key: match new_write_api_key {
Setting::Set(new_write_api_key) => Some(new_write_api_key),
Setting::Reset => None,
Setting::NotSet => old_write_api_key,
},
};
merged_remotes.insert(key, merged);
}
@@ -274,7 +310,8 @@ async fn patch_network(
&req,
);
let merged_network = DbNetwork { local: merged_self, remotes: merged_remotes };
let merged_network =
DbNetwork { local: merged_self, remotes: merged_remotes, sharding: merged_sharding };
index_scheduler.put_network(merged_network.clone())?;
debug!(returns = ?merged_network, "Patch network");
Ok(HttpResponse::Ok().json(merged_network))
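The new `sharding` flag goes through the same three-way `Setting` merge as the other network fields. A sketch of the observable behaviour of `PATCH /network`, assuming the usual deserr mapping of absent/null/value to `NotSet`/`Reset`/`Set`:
// {"sharding": true}  -> Setting::Set(true)  -> sharding becomes true (requires "self" to be set)
// {"sharding": null}  -> Setting::Reset      -> sharding falls back to false
// {}                  -> Setting::NotSet     -> sharding keeps its previous value
// {"sharding": true} while "self" is unset   -> 400 with code invalid_network_sharding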

View File

@@ -10,11 +10,11 @@ use actix_http::StatusCode;
use index_scheduler::{IndexScheduler, RoFeatures};
use itertools::Itertools;
use meilisearch_types::error::ResponseError;
use meilisearch_types::features::{Network, Remote};
use meilisearch_types::milli::order_by_map::OrderByMap;
use meilisearch_types::milli::score_details::{ScoreDetails, WeightedScoreValue};
use meilisearch_types::milli::vector::Embedding;
use meilisearch_types::milli::{self, DocumentId, OrderBy, TimeBudget, DEFAULT_VALUES_PER_FACET};
use meilisearch_types::network::{Network, Remote};
use roaring::RoaringBitmap;
use tokio::task::JoinHandle;

View File

@@ -1,6 +1,6 @@
pub use error::ProxySearchError;
use error::ReqwestErrorWithoutUrl;
use meilisearch_types::features::Remote;
use meilisearch_types::network::Remote;
use rand::Rng as _;
use reqwest::{Client, Response, StatusCode};
use serde::de::DeserializeOwned;

View File

@@ -46,7 +46,7 @@ async fn errors_on_param() {
meili_snap::snapshot!(code, @"400 Bad Request");
meili_snap::snapshot!(meili_snap::json_string!(response), @r###"
{
"message": "Unknown field `selfie`: expected one of `remotes`, `self`",
"message": "Unknown field `selfie`: expected one of `remotes`, `self`, `sharding`",
"code": "bad_request",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#bad_request"
@@ -149,7 +149,7 @@ async fn errors_on_param() {
meili_snap::snapshot!(code, @"400 Bad Request");
meili_snap::snapshot!(meili_snap::json_string!(response), @r###"
{
"message": "Unknown field `doggo` inside `.remotes.new`: expected one of `url`, `searchApiKey`",
"message": "Unknown field `doggo` inside `.remotes.new`: expected one of `url`, `searchApiKey`, `writeApiKey`",
"code": "invalid_network_remotes",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_network_remotes"
@@ -192,9 +192,11 @@ async fn errors_on_param() {
"remotes": {
"kefir": {
"url": "http://localhost:7700",
"searchApiKey": null
"searchApiKey": null,
"writeApiKey": null
}
}
},
"sharding": false
}
"###);
let (response, code) = server
@@ -266,7 +268,8 @@ async fn auth() {
meili_snap::snapshot!(meili_snap::json_string!(response), @r###"
{
"self": "master",
"remotes": {}
"remotes": {},
"sharding": false
}
"###);
@@ -274,11 +277,12 @@ async fn auth() {
meili_snap::snapshot!(code, @"200 OK");
meili_snap::snapshot!(meili_snap::json_string!(response), @r###"
{
"self": "master",
"remotes": {}
}
"###);
{
"self": "master",
"remotes": {},
"sharding": false
}
"###);
// try get with get permission
server.use_api_key(get_network_key.as_str().unwrap());
@@ -286,11 +290,12 @@ async fn auth() {
meili_snap::snapshot!(code, @"200 OK");
meili_snap::snapshot!(meili_snap::json_string!(response), @r###"
{
"self": "master",
"remotes": {}
}
"###);
{
"self": "master",
"remotes": {},
"sharding": false
}
"###);
// try update with update permission
server.use_api_key(update_network_key.as_str().unwrap());
@@ -303,11 +308,12 @@ async fn auth() {
meili_snap::snapshot!(code, @"200 OK");
meili_snap::snapshot!(meili_snap::json_string!(response), @r###"
{
"self": "api_key",
"remotes": {}
}
"###);
{
"self": "api_key",
"remotes": {},
"sharding": false
}
"###);
// try with the other's permission
let (response, code) = server.get_network().await;
@@ -383,7 +389,8 @@ async fn get_and_set_network() {
meili_snap::snapshot!(meili_snap::json_string!(response), @r###"
{
"self": null,
"remotes": {}
"remotes": {},
"sharding": false
}
"###);
@@ -393,7 +400,8 @@ async fn get_and_set_network() {
meili_snap::snapshot!(meili_snap::json_string!(response), @r###"
{
"self": "myself",
"remotes": {}
"remotes": {},
"sharding": false
}
"###);
@@ -417,13 +425,16 @@ async fn get_and_set_network() {
"remotes": {
"myself": {
"url": "http://localhost:7700",
"searchApiKey": null
"searchApiKey": null,
"writeApiKey": null
},
"thy": {
"url": "http://localhost:7701",
"searchApiKey": "foo"
"searchApiKey": "foo",
"writeApiKey": null
}
}
},
"sharding": false
}
"###);
@@ -443,13 +454,16 @@ async fn get_and_set_network() {
"remotes": {
"myself": {
"url": "http://localhost:7700",
"searchApiKey": null
"searchApiKey": null,
"writeApiKey": null
},
"thy": {
"url": "http://localhost:7701",
"searchApiKey": "bar"
"searchApiKey": "bar",
"writeApiKey": null
}
}
},
"sharding": false
}
"###);
@@ -470,17 +484,21 @@ async fn get_and_set_network() {
"remotes": {
"myself": {
"url": "http://localhost:7700",
"searchApiKey": null
"searchApiKey": null,
"writeApiKey": null
},
"them": {
"url": "http://localhost:7702",
"searchApiKey": "baz"
"searchApiKey": "baz",
"writeApiKey": null
},
"thy": {
"url": "http://localhost:7701",
"searchApiKey": "bar"
"searchApiKey": "bar",
"writeApiKey": null
}
}
},
"sharding": false
}
"###);
@@ -498,13 +516,16 @@ async fn get_and_set_network() {
"remotes": {
"them": {
"url": "http://localhost:7702",
"searchApiKey": "baz"
"searchApiKey": "baz",
"writeApiKey": null
},
"thy": {
"url": "http://localhost:7701",
"searchApiKey": "bar"
"searchApiKey": "bar",
"writeApiKey": null
}
}
},
"sharding": false
}
"###);
@@ -518,13 +539,16 @@ async fn get_and_set_network() {
"remotes": {
"them": {
"url": "http://localhost:7702",
"searchApiKey": "baz"
"searchApiKey": "baz",
"writeApiKey": null
},
"thy": {
"url": "http://localhost:7701",
"searchApiKey": "bar"
"searchApiKey": "bar",
"writeApiKey": null
}
}
},
"sharding": false
}
"###);
@@ -538,13 +562,16 @@ async fn get_and_set_network() {
"remotes": {
"them": {
"url": "http://localhost:7702",
"searchApiKey": "baz"
"searchApiKey": "baz",
"writeApiKey": null
},
"thy": {
"url": "http://localhost:7701",
"searchApiKey": "bar"
"searchApiKey": "bar",
"writeApiKey": null
}
}
},
"sharding": false
}
"###);
@@ -553,60 +580,69 @@ async fn get_and_set_network() {
meili_snap::snapshot!(code, @"200 OK");
meili_snap::snapshot!(meili_snap::json_string!(response), @r###"
{
"self": "thy",
"remotes": {
"them": {
"url": "http://localhost:7702",
"searchApiKey": "baz"
},
"thy": {
"url": "http://localhost:7701",
"searchApiKey": "bar"
}
}
{
"self": "thy",
"remotes": {
"them": {
"url": "http://localhost:7702",
"searchApiKey": "baz",
"writeApiKey": null
},
"thy": {
"url": "http://localhost:7701",
"searchApiKey": "bar",
"writeApiKey": null
}
"###);
},
"sharding": false
}
"###);
// still doing nothing
let (response, code) = server.set_network(json!({"remotes": {}})).await;
meili_snap::snapshot!(code, @"200 OK");
meili_snap::snapshot!(meili_snap::json_string!(response), @r###"
{
"self": "thy",
"remotes": {
"them": {
"url": "http://localhost:7702",
"searchApiKey": "baz"
},
"thy": {
"url": "http://localhost:7701",
"searchApiKey": "bar"
}
}
{
"self": "thy",
"remotes": {
"them": {
"url": "http://localhost:7702",
"searchApiKey": "baz",
"writeApiKey": null
},
"thy": {
"url": "http://localhost:7701",
"searchApiKey": "bar",
"writeApiKey": null
}
"###);
},
"sharding": false
}
"###);
// good time to check GET
let (response, code) = server.get_network().await;
meili_snap::snapshot!(code, @"200 OK");
meili_snap::snapshot!(meili_snap::json_string!(response), @r###"
{
"self": "thy",
"remotes": {
"them": {
"url": "http://localhost:7702",
"searchApiKey": "baz"
},
"thy": {
"url": "http://localhost:7701",
"searchApiKey": "bar"
}
}
{
"self": "thy",
"remotes": {
"them": {
"url": "http://localhost:7702",
"searchApiKey": "baz",
"writeApiKey": null
},
"thy": {
"url": "http://localhost:7701",
"searchApiKey": "bar",
"writeApiKey": null
}
"###);
},
"sharding": false
}
"###);
// deleting everything
let (response, code) = server
@@ -619,7 +655,8 @@ async fn get_and_set_network() {
meili_snap::snapshot!(meili_snap::json_string!(response), @r###"
{
"self": "thy",
"remotes": {}
"remotes": {},
"sharding": false
}
"###);
}

View File

@@ -132,7 +132,8 @@ async fn remote_sharding() {
snapshot!(json_string!(response), @r###"
{
"self": "ms0",
"remotes": {}
"remotes": {},
"sharding": false
}
"###);
let (response, code) = ms1.set_network(json!({"self": "ms1"})).await;
@@ -140,7 +141,8 @@ async fn remote_sharding() {
snapshot!(json_string!(response), @r###"
{
"self": "ms1",
"remotes": {}
"remotes": {},
"sharding": false
}
"###);
let (response, code) = ms2.set_network(json!({"self": "ms2"})).await;
@@ -148,7 +150,8 @@ async fn remote_sharding() {
snapshot!(json_string!(response), @r###"
{
"self": "ms2",
"remotes": {}
"remotes": {},
"sharding": false
}
"###);
@@ -444,7 +447,8 @@ async fn remote_sharding_retrieve_vectors() {
snapshot!(json_string!(response), @r###"
{
"self": "ms0",
"remotes": {}
"remotes": {},
"sharding": false
}
"###);
let (response, code) = ms1.set_network(json!({"self": "ms1"})).await;
@@ -452,7 +456,8 @@ async fn remote_sharding_retrieve_vectors() {
snapshot!(json_string!(response), @r###"
{
"self": "ms1",
"remotes": {}
"remotes": {},
"sharding": false
}
"###);
let (response, code) = ms2.set_network(json!({"self": "ms2"})).await;
@@ -460,7 +465,8 @@ async fn remote_sharding_retrieve_vectors() {
snapshot!(json_string!(response), @r###"
{
"self": "ms2",
"remotes": {}
"remotes": {},
"sharding": false
}
"###);
@@ -934,7 +940,8 @@ async fn error_unregistered_remote() {
snapshot!(json_string!(response), @r###"
{
"self": "ms0",
"remotes": {}
"remotes": {},
"sharding": false
}
"###);
let (response, code) = ms1.set_network(json!({"self": "ms1"})).await;
@@ -942,7 +949,8 @@ async fn error_unregistered_remote() {
snapshot!(json_string!(response), @r###"
{
"self": "ms1",
"remotes": {}
"remotes": {},
"sharding": false
}
"###);
@@ -1052,7 +1060,8 @@ async fn error_no_weighted_score() {
snapshot!(json_string!(response), @r###"
{
"self": "ms0",
"remotes": {}
"remotes": {},
"sharding": false
}
"###);
let (response, code) = ms1.set_network(json!({"self": "ms1"})).await;
@@ -1060,7 +1069,8 @@ async fn error_no_weighted_score() {
snapshot!(json_string!(response), @r###"
{
"self": "ms1",
"remotes": {}
"remotes": {},
"sharding": false
}
"###);
@@ -1185,7 +1195,8 @@ async fn error_bad_response() {
snapshot!(json_string!(response), @r###"
{
"self": "ms0",
"remotes": {}
"remotes": {},
"sharding": false
}
"###);
let (response, code) = ms1.set_network(json!({"self": "ms1"})).await;
@@ -1193,7 +1204,8 @@ async fn error_bad_response() {
snapshot!(json_string!(response), @r###"
{
"self": "ms1",
"remotes": {}
"remotes": {},
"sharding": false
}
"###);
@@ -1322,7 +1334,8 @@ async fn error_bad_request() {
snapshot!(json_string!(response), @r###"
{
"self": "ms0",
"remotes": {}
"remotes": {},
"sharding": false
}
"###);
let (response, code) = ms1.set_network(json!({"self": "ms1"})).await;
@@ -1330,7 +1343,8 @@ async fn error_bad_request() {
snapshot!(json_string!(response), @r###"
{
"self": "ms1",
"remotes": {}
"remotes": {},
"sharding": false
}
"###);
@@ -1452,7 +1466,8 @@ async fn error_bad_request_facets_by_index() {
snapshot!(json_string!(response), @r###"
{
"self": "ms0",
"remotes": {}
"remotes": {},
"sharding": false
}
"###);
let (response, code) = ms1.set_network(json!({"self": "ms1"})).await;
@@ -1460,7 +1475,8 @@ async fn error_bad_request_facets_by_index() {
snapshot!(json_string!(response), @r###"
{
"self": "ms1",
"remotes": {}
"remotes": {},
"sharding": false
}
"###);
@@ -1593,7 +1609,8 @@ async fn error_bad_request_facets_by_index_facet() {
snapshot!(json_string!(response), @r###"
{
"self": "ms0",
"remotes": {}
"remotes": {},
"sharding": false
}
"###);
let (response, code) = ms1.set_network(json!({"self": "ms1"})).await;
@@ -1601,7 +1618,8 @@ async fn error_bad_request_facets_by_index_facet() {
snapshot!(json_string!(response), @r###"
{
"self": "ms1",
"remotes": {}
"remotes": {},
"sharding": false
}
"###);
@@ -1743,7 +1761,8 @@ async fn error_remote_does_not_answer() {
snapshot!(json_string!(response), @r###"
{
"self": "ms0",
"remotes": {}
"remotes": {},
"sharding": false
}
"###);
let (response, code) = ms1.set_network(json!({"self": "ms1"})).await;
@@ -1751,7 +1770,8 @@ async fn error_remote_does_not_answer() {
snapshot!(json_string!(response), @r###"
{
"self": "ms1",
"remotes": {}
"remotes": {},
"sharding": false
}
"###);
@@ -1944,7 +1964,8 @@ async fn error_remote_404() {
snapshot!(json_string!(response), @r###"
{
"self": "ms0",
"remotes": {}
"remotes": {},
"sharding": false
}
"###);
let (response, code) = ms1.set_network(json!({"self": "ms1"})).await;
@@ -1952,7 +1973,8 @@ async fn error_remote_404() {
snapshot!(json_string!(response), @r###"
{
"self": "ms1",
"remotes": {}
"remotes": {},
"sharding": false
}
"###);
@@ -2139,7 +2161,8 @@ async fn error_remote_sharding_auth() {
snapshot!(json_string!(response), @r###"
{
"self": "ms0",
"remotes": {}
"remotes": {},
"sharding": false
}
"###);
let (response, code) = ms1.set_network(json!({"self": "ms1"})).await;
@@ -2147,7 +2170,8 @@ async fn error_remote_sharding_auth() {
snapshot!(json_string!(response), @r###"
{
"self": "ms1",
"remotes": {}
"remotes": {},
"sharding": false
}
"###);
@@ -2299,7 +2323,8 @@ async fn remote_sharding_auth() {
snapshot!(json_string!(response), @r###"
{
"self": "ms0",
"remotes": {}
"remotes": {},
"sharding": false
}
"###);
let (response, code) = ms1.set_network(json!({"self": "ms1"})).await;
@@ -2307,7 +2332,8 @@ async fn remote_sharding_auth() {
snapshot!(json_string!(response), @r###"
{
"self": "ms1",
"remotes": {}
"remotes": {},
"sharding": false
}
"###);
@@ -2454,7 +2480,8 @@ async fn error_remote_500() {
snapshot!(json_string!(response), @r###"
{
"self": "ms0",
"remotes": {}
"remotes": {},
"sharding": false
}
"###);
let (response, code) = ms1.set_network(json!({"self": "ms1"})).await;
@@ -2462,7 +2489,8 @@ async fn error_remote_500() {
snapshot!(json_string!(response), @r###"
{
"self": "ms1",
"remotes": {}
"remotes": {},
"sharding": false
}
"###);
@@ -2633,7 +2661,8 @@ async fn error_remote_500_once() {
snapshot!(json_string!(response), @r###"
{
"self": "ms0",
"remotes": {}
"remotes": {},
"sharding": false
}
"###);
let (response, code) = ms1.set_network(json!({"self": "ms1"})).await;
@@ -2641,7 +2670,8 @@ async fn error_remote_500_once() {
snapshot!(json_string!(response), @r###"
{
"self": "ms1",
"remotes": {}
"remotes": {},
"sharding": false
}
"###);

View File

@@ -43,7 +43,7 @@ async fn version_too_old() {
std::fs::write(db_path.join("VERSION"), "1.11.9999").unwrap();
let options = Opt { experimental_dumpless_upgrade: true, ..default_settings };
let err = Server::new_with_options(options).await.map(|_| ()).unwrap_err();
snapshot!(err, @"Database version 1.11.9999 is too old for the experimental dumpless upgrade feature. Please generate a dump using the v1.11.9999 and import it in the v1.18.0");
snapshot!(err, @"Database version 1.11.9999 is too old for the experimental dumpless upgrade feature. Please generate a dump using the v1.11.9999 and import it in the v1.22.0");
}
#[actix_rt::test]
@@ -58,7 +58,7 @@ async fn version_requires_downgrade() {
std::fs::write(db_path.join("VERSION"), format!("{major}.{minor}.{patch}")).unwrap();
let options = Opt { experimental_dumpless_upgrade: true, ..default_settings };
let err = Server::new_with_options(options).await.map(|_| ()).unwrap_err();
snapshot!(err, @"Database version 1.18.1 is higher than the Meilisearch version 1.18.0. Downgrade is not supported");
snapshot!(err, @"Database version 1.22.1 is higher than the Meilisearch version 1.22.0. Downgrade is not supported");
}
#[actix_rt::test]

View File

@@ -8,7 +8,7 @@ source: crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs
"progress": null,
"details": {
"upgradeFrom": "v1.12.0",
"upgradeTo": "v1.18.0"
"upgradeTo": "v1.22.0"
},
"stats": {
"totalNbTasks": 1,

View File

@@ -8,7 +8,7 @@ source: crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs
"progress": null,
"details": {
"upgradeFrom": "v1.12.0",
"upgradeTo": "v1.18.0"
"upgradeTo": "v1.22.0"
},
"stats": {
"totalNbTasks": 1,

View File

@@ -8,7 +8,7 @@ source: crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs
"progress": null,
"details": {
"upgradeFrom": "v1.12.0",
"upgradeTo": "v1.18.0"
"upgradeTo": "v1.22.0"
},
"stats": {
"totalNbTasks": 1,

View File

@@ -12,7 +12,7 @@ source: crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs
"canceledBy": null,
"details": {
"upgradeFrom": "v1.12.0",
"upgradeTo": "v1.18.0"
"upgradeTo": "v1.22.0"
},
"error": null,
"duration": "[duration]",

View File

@@ -12,7 +12,7 @@ source: crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs
"canceledBy": null,
"details": {
"upgradeFrom": "v1.12.0",
"upgradeTo": "v1.18.0"
"upgradeTo": "v1.22.0"
},
"error": null,
"duration": "[duration]",

View File

@@ -12,7 +12,7 @@ source: crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs
"canceledBy": null,
"details": {
"upgradeFrom": "v1.12.0",
"upgradeTo": "v1.18.0"
"upgradeTo": "v1.22.0"
},
"error": null,
"duration": "[duration]",

View File

@@ -8,7 +8,7 @@ source: crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs
"progress": null,
"details": {
"upgradeFrom": "v1.12.0",
"upgradeTo": "v1.18.0"
"upgradeTo": "v1.22.0"
},
"stats": {
"totalNbTasks": 1,

View File

@@ -12,7 +12,7 @@ source: crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs
"canceledBy": null,
"details": {
"upgradeFrom": "v1.12.0",
"upgradeTo": "v1.18.0"
"upgradeTo": "v1.22.0"
},
"error": null,
"duration": "[duration]",

View File

@@ -104,8 +104,8 @@ async fn binary_quantize_before_sending_documents() {
"manual": {
"embeddings": [
[
-1.0,
-1.0,
0.0,
0.0,
1.0
]
],
@@ -122,7 +122,7 @@ async fn binary_quantize_before_sending_documents() {
[
1.0,
1.0,
-1.0
0.0
]
],
"regenerate": false
@@ -191,8 +191,8 @@ async fn binary_quantize_after_sending_documents() {
"manual": {
"embeddings": [
[
-1.0,
-1.0,
0.0,
0.0,
1.0
]
],
@@ -209,7 +209,7 @@ async fn binary_quantize_after_sending_documents() {
[
1.0,
1.0,
-1.0
0.0
]
],
"regenerate": false
@@ -320,7 +320,7 @@ async fn binary_quantize_clear_documents() {
}
"###);
// Make sure the arroy DB has been cleared
// Make sure the hannoy DB has been cleared
let (documents, _code) =
index.search_post(json!({ "hybrid": { "embedder": "manual" }, "vector": [1, 1, 1] })).await;
snapshot!(documents, @r#"

View File

@@ -684,7 +684,7 @@ async fn clear_documents() {
}
"###);
// Make sure the arroy DB has been cleared
// Make sure the hannoy DB has been cleared
let (documents, _code) =
index.search_post(json!({ "vector": [1, 1, 1], "hybrid": {"embedder": "manual"} })).await;
snapshot!(documents, @r#"

View File

@@ -236,7 +236,7 @@ async fn reset_embedder_documents() {
}
"###);
// Make sure the arroy DB has been cleared
// Make sure the hannoy DB has been cleared
let (documents, _code) =
index.search_post(json!({ "vector": [1, 1, 1], "hybrid": {"embedder": "default"} })).await;
snapshot!(json_string!(documents), @r###"

View File

@@ -142,8 +142,8 @@ enum Command {
#[derive(Clone, ValueEnum)]
enum IndexPart {
/// Will make the arroy index hot.
Arroy,
/// Will make the hannoy index hot.
Hannoy,
}
fn main() -> anyhow::Result<()> {
@@ -658,12 +658,12 @@ fn hair_dryer(
let rtxn = index.read_txn()?;
for part in index_parts {
match part {
IndexPart::Arroy => {
IndexPart::Hannoy => {
let mut count = 0;
let total = index.vector_arroy.len(&rtxn)?;
eprintln!("Hair drying arroy for {uid}...");
let total = index.vector_store.len(&rtxn)?;
eprintln!("Hair drying hannoy for {uid}...");
for (i, result) in index
.vector_arroy
.vector_store
.remap_types::<Bytes, Bytes>()
.iter(&rtxn)?
.enumerate()

View File

@@ -68,7 +68,7 @@ pub fn v1_10_to_v1_11(
)
})?;
let index_read_database =
try_opening_poly_database(&index_env, &index_rtxn, db_name::VECTOR_ARROY)
try_opening_poly_database(&index_env, &index_rtxn, db_name::VECTOR_STORE)
.with_context(|| format!("while updating date format for index `{uid}`"))?;
let mut index_wtxn = index_env.write_txn().with_context(|| {
@@ -79,7 +79,7 @@ pub fn v1_10_to_v1_11(
})?;
let index_write_database =
try_opening_poly_database(&index_env, &index_wtxn, db_name::VECTOR_ARROY)
try_opening_poly_database(&index_env, &index_wtxn, db_name::VECTOR_STORE)
.with_context(|| format!("while updating date format for index `{uid}`"))?;
meilisearch_types::milli::arroy::upgrade::cosine_from_0_4_to_0_5(

View File

@@ -88,6 +88,7 @@ rhai = { version = "1.22.2", features = [
"sync",
] }
arroy = "0.6.1"
hannoy = "0.0.4"
rand = "0.8.5"
tracing = "0.1.41"
ureq = { version = "2.12.1", features = ["json"] }
@@ -95,6 +96,7 @@ url = "2.5.4"
hashbrown = "0.15.4"
bumpalo = "3.18.1"
bumparaw-collections = "0.1.4"
steppe = { version = "0.4.0", default-features = false }
thread_local = "1.1.9"
allocator-api2 = "0.3.0"
rustc-hash = "2.1.1"
@@ -109,6 +111,7 @@ utoipa = { version = "5.4.0", features = [
"openapi_extensions",
] }
lru = "0.14.0"
twox-hash = { version = "2.1.1", default-features = false, features = ["std", "xxhash3_64", "xxhash64"] }
[dev-dependencies]
mimalloc = { version = "0.1.47", default-features = false }

View File

@@ -1,17 +1,13 @@
use crate::{
distance_between_two_points,
heed_codec::facet::{FieldDocIdFacetCodec, OrderedF64Codec},
lat_lng_to_xyz,
search::new::{facet_string_values, facet_values_prefix_key},
GeoPoint, Index,
};
use heed::{
types::{Bytes, Unit},
RoPrefix, RoTxn,
};
use std::collections::VecDeque;
use heed::types::{Bytes, Unit};
use heed::{RoPrefix, RoTxn};
use roaring::RoaringBitmap;
use rstar::RTree;
use std::collections::VecDeque;
use crate::heed_codec::facet::{FieldDocIdFacetCodec, OrderedF64Codec};
use crate::search::new::{facet_string_values, facet_values_prefix_key};
use crate::{distance_between_two_points, lat_lng_to_xyz, GeoPoint, Index};
#[derive(Debug, Clone, Copy)]
pub struct GeoSortParameter {

View File

@@ -1,19 +1,16 @@
use std::collections::{BTreeSet, VecDeque};
use crate::{
constants::RESERVED_GEO_FIELD_NAME,
documents::{geo_sort::next_bucket, GeoSortParameter},
heed_codec::{
facet::{FacetGroupKeyCodec, FacetGroupValueCodec},
BytesRefCodec,
},
is_faceted,
search::facet::{ascending_facet_sort, descending_facet_sort},
AscDesc, DocumentId, Member, UserError,
};
use heed::Database;
use roaring::RoaringBitmap;
use crate::constants::RESERVED_GEO_FIELD_NAME;
use crate::documents::geo_sort::next_bucket;
use crate::documents::GeoSortParameter;
use crate::heed_codec::facet::{FacetGroupKeyCodec, FacetGroupValueCodec};
use crate::heed_codec::BytesRefCodec;
use crate::search::facet::{ascending_facet_sort, descending_facet_sort};
use crate::{is_faceted, AscDesc, DocumentId, Member, UserError};
#[derive(Debug, Clone, Copy)]
enum AscDescId {
Facet { field_id: u16, ascending: bool },

View File

@@ -78,6 +78,8 @@ pub enum InternalError {
#[error(transparent)]
ArroyError(#[from] arroy::Error),
#[error(transparent)]
HannoyError(#[from] hannoy::Error),
#[error(transparent)]
VectorEmbeddingError(#[from] crate::vector::Error),
}
@@ -441,6 +443,29 @@ impl From<arroy::Error> for Error {
}
}
impl From<hannoy::Error> for Error {
fn from(value: hannoy::Error) -> Self {
match value {
hannoy::Error::Heed(heed) => heed.into(),
hannoy::Error::Io(io) => io.into(),
hannoy::Error::InvalidVecDimension { expected, received } => {
Error::UserError(UserError::InvalidVectorDimensions { expected, found: received })
}
hannoy::Error::BuildCancelled => Error::InternalError(InternalError::AbortedIndexation),
hannoy::Error::DatabaseFull
| hannoy::Error::InvalidItemAppend
| hannoy::Error::UnmatchingDistance { .. }
| hannoy::Error::NeedBuild(_)
| hannoy::Error::MissingKey { .. }
| hannoy::Error::MissingMetadata(_)
| hannoy::Error::UnknownVersion { .. }
| hannoy::Error::CannotDecodeKeyMode { .. } => {
Error::InternalError(InternalError::HannoyError(value))
}
}
}
}
#[derive(Error, Debug)]
pub enum GeoError {
#[error("The `_geo` field in the document with the id: `{document_id}` is not an object. Was expecting an object with the `_geo.lat` and `_geo.lng` fields but instead got `{value}`.")]

View File

@@ -31,7 +31,7 @@ use crate::prompt::PromptData;
use crate::proximity::ProximityPrecision;
use crate::update::new::StdResult;
use crate::vector::db::IndexEmbeddingConfigs;
use crate::vector::{ArroyStats, ArroyWrapper, Embedding};
use crate::vector::{Embedding, HannoyStats, VectorStore};
use crate::{
default_criteria, CboRoaringBitmapCodec, Criterion, DocumentId, ExternalDocumentsIds,
FacetDistribution, FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldIdWordCountCodec,
@@ -113,7 +113,7 @@ pub mod db_name {
pub const FIELD_ID_DOCID_FACET_F64S: &str = "field-id-docid-facet-f64s";
pub const FIELD_ID_DOCID_FACET_STRINGS: &str = "field-id-docid-facet-strings";
pub const VECTOR_EMBEDDER_CATEGORY_ID: &str = "vector-embedder-category-id";
pub const VECTOR_ARROY: &str = "vector-arroy";
pub const VECTOR_STORE: &str = "vector-arroy";
pub const DOCUMENTS: &str = "documents";
}
const NUMBER_OF_DBS: u32 = 25;
@@ -177,10 +177,10 @@ pub struct Index {
/// Maps the document id, the facet field id and the strings.
pub field_id_docid_facet_strings: Database<FieldDocIdFacetStringCodec, Str>,
/// Maps an embedder name to its id in the arroy store.
/// Maps an embedder name to its id in the hannoy store.
pub(crate) embedder_category_id: Database<Unspecified, Unspecified>,
/// Vector store based on arroy™.
pub vector_arroy: arroy::Database<Unspecified>,
/// Vector store based on hannoy™.
pub vector_store: hannoy::Database<Unspecified>,
/// Maps the document id to the document as an obkv store.
pub(crate) documents: Database<BEU32, ObkvCodec>,
@@ -237,7 +237,7 @@ impl Index {
// vector stuff
let embedder_category_id =
env.create_database(&mut wtxn, Some(VECTOR_EMBEDDER_CATEGORY_ID))?;
let vector_arroy = env.create_database(&mut wtxn, Some(VECTOR_ARROY))?;
let vector_store = env.create_database(&mut wtxn, Some(VECTOR_STORE))?;
let documents = env.create_database(&mut wtxn, Some(DOCUMENTS))?;
@@ -264,7 +264,7 @@ impl Index {
facet_id_is_empty_docids,
field_id_docid_facet_f64s,
field_id_docid_facet_strings,
vector_arroy,
vector_store,
embedder_category_id,
documents,
};
@@ -1769,11 +1769,13 @@ impl Index {
) -> Result<BTreeMap<String, EmbeddingsWithMetadata>> {
let mut res = BTreeMap::new();
let embedders = self.embedding_configs();
let index_version = self.get_version(rtxn)?.unwrap();
for config in embedders.embedding_configs(rtxn)? {
let embedder_info = embedders.embedder_info(rtxn, &config.name)?.unwrap();
let has_fragments = config.config.embedder_options.has_fragments();
let reader = ArroyWrapper::new(
self.vector_arroy,
let reader = VectorStore::new(
index_version,
self.vector_store,
embedder_info.embedder_id,
config.config.quantized(),
);
@@ -1792,13 +1794,18 @@ impl Index {
Ok(PrefixSettings { compute_prefixes, max_prefix_length: 4, prefix_count_threshold: 100 })
}
pub fn arroy_stats(&self, rtxn: &RoTxn<'_>) -> Result<ArroyStats> {
let mut stats = ArroyStats::default();
pub fn hannoy_stats(&self, rtxn: &RoTxn<'_>) -> Result<HannoyStats> {
let mut stats = HannoyStats::default();
let embedding_configs = self.embedding_configs();
let index_version = self.get_version(rtxn)?.unwrap();
for config in embedding_configs.embedding_configs(rtxn)? {
let embedder_id = embedding_configs.embedder_id(rtxn, &config.name)?.unwrap();
let reader =
ArroyWrapper::new(self.vector_arroy, embedder_id, config.config.quantized());
let reader = VectorStore::new(
index_version,
self.vector_store,
embedder_id,
config.config.quantized(),
);
reader.aggregate_stats(rtxn, &mut stats)?;
}
Ok(stats)
@@ -1842,7 +1849,7 @@ impl Index {
facet_id_is_empty_docids,
field_id_docid_facet_f64s,
field_id_docid_facet_strings,
vector_arroy,
vector_store: vector_hannoy,
embedder_category_id,
documents,
} = self;
@@ -1913,7 +1920,7 @@ impl Index {
"field_id_docid_facet_strings",
field_id_docid_facet_strings.stat(rtxn).map(compute_size)?,
);
sizes.insert("vector_arroy", vector_arroy.stat(rtxn).map(compute_size)?);
sizes.insert("vector_hannoy", vector_hannoy.stat(rtxn).map(compute_size)?);
sizes.insert("embedder_category_id", embedder_category_id.stat(rtxn).map(compute_size)?);
sizes.insert("documents", documents.stat(rtxn).map(compute_size)?);

View File

@@ -53,7 +53,7 @@ pub use search::new::{
};
use serde_json::Value;
pub use thread_pool_no_abort::{PanicCatched, ThreadPoolNoAbort, ThreadPoolNoAbortBuilder};
pub use {arroy, charabia as tokenizer, heed, rhai};
pub use {arroy, charabia as tokenizer, hannoy, heed, rhai};
pub use self::asc_desc::{AscDesc, AscDescError, Member, SortError};
pub use self::attribute_patterns::{AttributePatterns, PatternMatch};

View File

@@ -5,7 +5,6 @@ use std::sync::atomic::{AtomicU32, AtomicUsize, Ordering};
use std::sync::{Arc, RwLock};
use std::time::{Duration, Instant};
use enum_iterator::Sequence;
use indexmap::IndexMap;
use itertools::Itertools;
use serde::Serialize;
@@ -96,14 +95,6 @@ impl Progress {
durations.drain(..).map(|(name, duration)| (name, format!("{duration:.2?}"))).collect()
}
// TODO: ideally we should expose the progress in a way that let arroy use it directly
pub(crate) fn update_progress_from_arroy(&self, progress: arroy::WriterProgress) {
self.update_progress(progress.main);
if let Some(sub) = progress.sub {
self.update_progress(sub);
}
}
}
/// Generate the names associated with the durations and push them.
@@ -277,43 +268,26 @@ impl<U: Send + Sync + 'static> Step for VariableNameStep<U> {
}
}
impl Step for arroy::MainStep {
fn name(&self) -> Cow<'static, str> {
match self {
arroy::MainStep::PreProcessingTheItems => "pre processing the items",
arroy::MainStep::WritingTheDescendantsAndMetadata => {
"writing the descendants and metadata"
}
arroy::MainStep::RetrieveTheUpdatedItems => "retrieve the updated items",
arroy::MainStep::RetrievingTheTreeAndItemNodes => "retrieving the tree and item nodes",
arroy::MainStep::UpdatingTheTrees => "updating the trees",
arroy::MainStep::CreateNewTrees => "create new trees",
arroy::MainStep::WritingNodesToDatabase => "writing nodes to database",
arroy::MainStep::DeleteExtraneousTrees => "delete extraneous trees",
arroy::MainStep::WriteTheMetadata => "write the metadata",
}
.into()
}
// Integration with steppe
fn current(&self) -> u32 {
*self as u32
}
fn total(&self) -> u32 {
Self::CARDINALITY as u32
impl steppe::Progress for Progress {
fn update(&self, sub_progress: impl steppe::Step) {
self.update_progress(Compat(sub_progress));
}
}
impl Step for arroy::SubStep {
struct Compat<T: steppe::Step>(T);
impl<T: steppe::Step> Step for Compat<T> {
fn name(&self) -> Cow<'static, str> {
self.unit.into()
self.0.name()
}
fn current(&self) -> u32 {
self.current.load(Ordering::Relaxed)
self.0.current().try_into().unwrap_or(u32::MAX)
}
fn total(&self) -> u32 {
self.max
self.0.total().try_into().unwrap_or(u32::MAX)
}
}
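The removed arroy-specific plumbing is replaced by this generic bridge: implementing `steppe::Progress` for `Progress` lets any component that reports through `steppe` — hannoy, by the look of the build changes above — feed milli's progress view directly.
// `update` wraps each steppe step in `Compat`, which adapts steppe's
// name/current/total into milli's `Step` trait, so the vector-store build
// progress shows up in task progress much like `update_progress_from_arroy` did.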

View File

@@ -3,7 +3,7 @@ use roaring::{MultiOps, RoaringBitmap};
use crate::error::{DidYouMean, Error};
use crate::vector::db::IndexEmbeddingConfig;
use crate::vector::{ArroyStats, ArroyWrapper};
use crate::vector::{HannoyStats, VectorStore};
use crate::Index;
#[derive(Debug, thiserror::Error)]
@@ -82,6 +82,7 @@ fn evaluate_inner(
embedding_configs: &[IndexEmbeddingConfig],
filter: &VectorFilter<'_>,
) -> crate::Result<RoaringBitmap> {
let index_version = index.get_version(rtxn)?.unwrap();
let embedder_name = embedder.value();
let available_embedders =
|| embedding_configs.iter().map(|c| c.name.clone()).collect::<Vec<_>>();
@@ -96,8 +97,9 @@ fn evaluate_inner(
.embedder_info(rtxn, embedder_name)?
.ok_or_else(|| EmbedderDoesNotExist { embedder, available: available_embedders() })?;
let arroy_wrapper = ArroyWrapper::new(
index.vector_arroy,
let vector_store = VectorStore::new(
index_version,
index.vector_store,
embedder_info.embedder_id,
embedding_config.config.quantized(),
);
@@ -122,7 +124,7 @@ fn evaluate_inner(
})?;
let user_provided_docids = embedder_info.embedding_status.user_provided_docids();
arroy_wrapper.items_in_store(rtxn, fragment_config.id, |bitmap| {
vector_store.items_in_store(rtxn, fragment_config.id, |bitmap| {
bitmap.clone() - user_provided_docids
})?
}
@@ -132,8 +134,8 @@ fn evaluate_inner(
}
let user_provided_docids = embedder_info.embedding_status.user_provided_docids();
let mut stats = ArroyStats::default();
arroy_wrapper.aggregate_stats(rtxn, &mut stats)?;
let mut stats = HannoyStats::default();
vector_store.aggregate_stats(rtxn, &mut stats)?;
stats.documents - user_provided_docids.clone()
}
VectorFilter::UserProvided => {
@@ -141,14 +143,14 @@ fn evaluate_inner(
user_provided_docids.clone()
}
VectorFilter::Regenerate => {
let mut stats = ArroyStats::default();
arroy_wrapper.aggregate_stats(rtxn, &mut stats)?;
let mut stats = HannoyStats::default();
vector_store.aggregate_stats(rtxn, &mut stats)?;
let skip_regenerate = embedder_info.embedding_status.skip_regenerate_docids();
stats.documents - skip_regenerate
}
VectorFilter::None => {
let mut stats = ArroyStats::default();
arroy_wrapper.aggregate_stats(rtxn, &mut stats)?;
let mut stats = HannoyStats::default();
vector_store.aggregate_stats(rtxn, &mut stats)?;
stats.documents
}
};

View File

@@ -76,6 +76,7 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index {
&mut new_fields_ids_map,
&|| false,
Progress::default(),
None,
)
.unwrap();

View File

@@ -6,7 +6,7 @@ use roaring::RoaringBitmap;
use super::ranking_rules::{RankingRule, RankingRuleOutput, RankingRuleQueryTrait};
use super::VectorStoreStats;
use crate::score_details::{self, ScoreDetails};
use crate::vector::{ArroyWrapper, DistributionShift, Embedder};
use crate::vector::{DistributionShift, Embedder, VectorStore};
use crate::{DocumentId, Result, SearchContext, SearchLogger};
pub struct VectorSort<Q: RankingRuleQueryTrait> {
@@ -56,7 +56,12 @@ impl<Q: RankingRuleQueryTrait> VectorSort<Q> {
let target = &self.target;
let before = Instant::now();
let reader = ArroyWrapper::new(ctx.index.vector_arroy, self.embedder_index, self.quantized);
let reader = VectorStore::new(
ctx.index.get_version(ctx.txn)?.unwrap(),
ctx.index.vector_store,
self.embedder_index,
self.quantized,
);
let results = reader.nns_by_vector(ctx.txn, target, self.limit, Some(vector_candidates))?;
self.cached_sorted_docids = results.into_iter();
*ctx.vector_store_stats.get_or_insert_default() += VectorStoreStats {

View File

@@ -3,7 +3,7 @@ use std::sync::Arc;
use roaring::RoaringBitmap;
use crate::score_details::{self, ScoreDetails};
use crate::vector::{ArroyWrapper, Embedder};
use crate::vector::{Embedder, VectorStore};
use crate::{filtered_universe, DocumentId, Filter, Index, Result, SearchResult};
pub struct Similar<'a> {
@@ -72,7 +72,12 @@ impl<'a> Similar<'a> {
crate::UserError::InvalidSimilarEmbedder(self.embedder_name.to_owned())
})?;
let reader = ArroyWrapper::new(self.index.vector_arroy, embedder_index, self.quantized);
let reader = VectorStore::new(
self.index.get_version(self.rtxn)?.unwrap(),
self.index.vector_store,
embedder_index,
self.quantized,
);
let results = reader.nns_by_item(
self.rtxn,
self.id,

View File

@@ -84,6 +84,7 @@ impl TempIndex {
&mut new_fields_ids_map,
&|| false,
Progress::default(),
None,
)?;
if let Some(error) = operation_stats.into_iter().find_map(|stat| stat.error) {
@@ -167,6 +168,7 @@ impl TempIndex {
&mut new_fields_ids_map,
&|| false,
Progress::default(),
None,
)?;
if let Some(error) = operation_stats.into_iter().find_map(|stat| stat.error) {
@@ -242,6 +244,7 @@ fn aborting_indexation() {
&mut new_fields_ids_map,
&|| false,
Progress::default(),
None,
)
.unwrap();

View File

@@ -2,7 +2,8 @@ use heed::RwTxn;
use roaring::RoaringBitmap;
use time::OffsetDateTime;
use crate::{database_stats::DatabaseStats, FieldDistribution, Index, Result};
use crate::database_stats::DatabaseStats;
use crate::{FieldDistribution, Index, Result};
pub struct ClearDocuments<'t, 'i> {
wtxn: &'t mut RwTxn<'i>,
@@ -45,7 +46,7 @@ impl<'t, 'i> ClearDocuments<'t, 'i> {
facet_id_is_empty_docids,
field_id_docid_facet_f64s,
field_id_docid_facet_strings,
vector_arroy,
vector_store,
embedder_category_id: _,
documents,
} = self.index;
@@ -88,7 +89,7 @@ impl<'t, 'i> ClearDocuments<'t, 'i> {
field_id_docid_facet_f64s.clear(self.wtxn)?;
field_id_docid_facet_strings.clear(self.wtxn)?;
// vector
vector_arroy.clear(self.wtxn)?;
vector_store.clear(self.wtxn)?;
documents.clear(self.wtxn)?;

View File

@@ -2,9 +2,8 @@ use std::collections::BTreeSet;
use std::fs::File;
use std::io::{self, BufReader};
use heed::{BytesDecode, BytesEncode};
use heed::BytesDecode;
use obkv::KvReaderU16;
use roaring::RoaringBitmap;
use super::helpers::{
create_sorter, create_writer, try_split_array_at, writer_into_reader, GrenadParameters,
@@ -16,7 +15,7 @@ use crate::index::db_name::DOCID_WORD_POSITIONS;
use crate::update::del_add::{is_noop_del_add_obkv, DelAdd, KvReaderDelAdd, KvWriterDelAdd};
use crate::update::index_documents::helpers::sorter_into_reader;
use crate::update::settings::InnerIndexSettingsDiff;
use crate::{CboRoaringBitmapCodec, DocumentId, FieldId, Result};
use crate::{DocumentId, FieldId, Result};
/// Extracts the word and the documents ids where this word appear.
///
@@ -201,45 +200,3 @@ fn words_into_sorter(
Ok(())
}
#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract")]
fn docids_into_writers<W>(
word: &str,
deletions: &RoaringBitmap,
additions: &RoaringBitmap,
writer: &mut grenad::Writer<W>,
) -> Result<()>
where
W: std::io::Write,
{
if deletions == additions {
// if the same value is deleted and added, do nothing.
return Ok(());
}
// Write each value in the same KvDelAdd before inserting it in the final writer.
let mut obkv = KvWriterDelAdd::memory();
// deletions:
if !deletions.is_empty() && !deletions.is_subset(additions) {
obkv.insert(
DelAdd::Deletion,
CboRoaringBitmapCodec::bytes_encode(deletions).map_err(|_| {
SerializationError::Encoding { db_name: Some(DOCID_WORD_POSITIONS) }
})?,
)?;
}
// additions:
if !additions.is_empty() {
obkv.insert(
DelAdd::Addition,
CboRoaringBitmapCodec::bytes_encode(additions).map_err(|_| {
SerializationError::Encoding { db_name: Some(DOCID_WORD_POSITIONS) }
})?,
)?;
}
// insert everything in the same writer.
writer.insert(word.as_bytes(), obkv.into_inner().unwrap())?;
Ok(())
}

View File

@@ -39,7 +39,7 @@ use crate::update::{
IndexerConfig, UpdateIndexingStep, WordPrefixDocids, WordPrefixIntegerDocids, WordsPrefixesFst,
};
use crate::vector::db::EmbedderInfo;
use crate::vector::{ArroyWrapper, RuntimeEmbedders};
use crate::vector::{RuntimeEmbedders, VectorStore};
use crate::{CboRoaringBitmapCodec, Index, Result, UserError};
static MERGED_DATABASE_COUNT: usize = 7;
@@ -485,6 +485,7 @@ where
// If an embedder wasn't used in the typedchunk but must be binary quantized
// we should insert it in `dimension`
let index_version = self.index.get_version(self.wtxn)?.unwrap();
for (name, action) in settings_diff.embedding_config_updates.iter() {
if action.is_being_quantized && !dimension.contains_key(name.as_str()) {
let index = self.index.embedding_configs().embedder_id(self.wtxn, name)?.ok_or(
@@ -493,8 +494,12 @@ where
key: None,
},
)?;
let reader =
ArroyWrapper::new(self.index.vector_arroy, index, action.was_quantized);
let reader = VectorStore::new(
index_version,
self.index.vector_store,
index,
action.was_quantized,
);
let Some(dim) = reader.dimensions(self.wtxn)? else {
continue;
};
@@ -504,7 +509,7 @@ where
for (embedder_name, dimension) in dimension {
let wtxn = &mut *self.wtxn;
let vector_arroy = self.index.vector_arroy;
let vector_hannoy = self.index.vector_store;
let cancel = &self.should_abort;
let embedder_index =
@@ -523,11 +528,12 @@ where
let is_quantizing = embedder_config.is_some_and(|action| action.is_being_quantized);
pool.install(|| {
let mut writer = ArroyWrapper::new(vector_arroy, embedder_index, was_quantized);
let mut writer =
VectorStore::new(index_version, vector_hannoy, embedder_index, was_quantized);
writer.build_and_quantize(
wtxn,
// In the settings we don't have any progress to share
&Progress::default(),
Progress::default(),
&mut rng,
dimension,
is_quantizing,
@@ -1977,6 +1983,7 @@ mod tests {
&mut new_fields_ids_map,
&|| false,
Progress::default(),
None,
)
.unwrap();
@@ -2029,6 +2036,7 @@ mod tests {
&mut new_fields_ids_map,
&|| false,
Progress::default(),
None,
)
.unwrap();
@@ -2117,6 +2125,7 @@ mod tests {
&mut new_fields_ids_map,
&|| false,
Progress::default(),
None,
)
.unwrap();
@@ -2306,6 +2315,7 @@ mod tests {
&mut new_fields_ids_map,
&|| false,
Progress::default(),
None,
)
.unwrap();
@@ -2369,6 +2379,7 @@ mod tests {
&mut new_fields_ids_map,
&|| false,
Progress::default(),
None,
)
.unwrap();
@@ -2423,6 +2434,7 @@ mod tests {
&mut new_fields_ids_map,
&|| false,
Progress::default(),
None,
)
.unwrap();
@@ -2476,6 +2488,7 @@ mod tests {
&mut new_fields_ids_map,
&|| false,
Progress::default(),
None,
)
.unwrap();
@@ -2531,6 +2544,7 @@ mod tests {
&mut new_fields_ids_map,
&|| false,
Progress::default(),
None,
)
.unwrap();
@@ -2591,6 +2605,7 @@ mod tests {
&mut new_fields_ids_map,
&|| false,
Progress::default(),
None,
)
.unwrap();
@@ -2644,6 +2659,7 @@ mod tests {
&mut new_fields_ids_map,
&|| false,
Progress::default(),
None,
)
.unwrap();
@@ -2697,6 +2713,7 @@ mod tests {
&mut new_fields_ids_map,
&|| false,
Progress::default(),
None,
)
.unwrap();
@@ -2908,6 +2925,7 @@ mod tests {
&mut new_fields_ids_map,
&|| false,
Progress::default(),
None,
)
.unwrap();
@@ -2968,6 +2986,7 @@ mod tests {
&mut new_fields_ids_map,
&|| false,
Progress::default(),
None,
)
.unwrap();
@@ -3025,6 +3044,7 @@ mod tests {
&mut new_fields_ids_map,
&|| false,
Progress::default(),
None,
)
.unwrap();

View File

@@ -32,7 +32,7 @@ use crate::update::settings::{InnerIndexSettings, InnerIndexSettingsDiff};
use crate::update::{AvailableIds, UpdateIndexingStep};
use crate::vector::parsed_vectors::{ExplicitVectors, VectorOrArrayOfVectors};
use crate::vector::settings::{RemoveFragments, WriteBackToDocuments};
use crate::vector::ArroyWrapper;
use crate::vector::VectorStore;
use crate::{FieldDistribution, FieldId, FieldIdMapMissingEntry, Index, Result};
pub struct TransformOutput {
@@ -834,15 +834,17 @@ impl<'a, 'i> Transform<'a, 'i> {
None
};
let readers: BTreeMap<&str, (ArroyWrapper, &RoaringBitmap)> = settings_diff
let index_version = self.index.get_version(wtxn)?.unwrap();
let readers: BTreeMap<&str, (VectorStore, &RoaringBitmap)> = settings_diff
.embedding_config_updates
.iter()
.filter_map(|(name, action)| {
if let Some(WriteBackToDocuments { embedder_id, user_provided }) =
action.write_back()
{
let reader = ArroyWrapper::new(
self.index.vector_arroy,
let reader = VectorStore::new(
index_version,
self.index.vector_store,
*embedder_id,
action.was_quantized,
);
@@ -882,10 +884,7 @@ impl<'a, 'i> Transform<'a, 'i> {
InternalError::DatabaseMissingEntry { db_name: db_name::DOCUMENTS, key: None },
)?;
let injected_vectors: std::result::Result<
serde_json::Map<String, serde_json::Value>,
arroy::Error,
> = readers
let injected_vectors: crate::Result<_> = readers
.iter()
.filter_map(|(name, (reader, user_provided))| {
if !user_provided.contains(docid) {
@@ -949,9 +948,13 @@ impl<'a, 'i> Transform<'a, 'i> {
else {
continue;
};
let arroy =
ArroyWrapper::new(self.index.vector_arroy, infos.embedder_id, was_quantized);
let Some(dimensions) = arroy.dimensions(wtxn)? else {
let hannoy = VectorStore::new(
index_version,
self.index.vector_store,
infos.embedder_id,
was_quantized,
);
let Some(dimensions) = hannoy.dimensions(wtxn)? else {
continue;
};
for fragment_id in fragment_ids {
@@ -959,17 +962,17 @@ impl<'a, 'i> Transform<'a, 'i> {
if infos.embedding_status.user_provided_docids().is_empty() {
// no user provided: clear store
arroy.clear_store(wtxn, *fragment_id, dimensions)?;
hannoy.clear_store(wtxn, *fragment_id, dimensions)?;
continue;
}
// some user provided, remove only the ids that are not user provided
let to_delete = arroy.items_in_store(wtxn, *fragment_id, |items| {
let to_delete = hannoy.items_in_store(wtxn, *fragment_id, |items| {
items - infos.embedding_status.user_provided_docids()
})?;
for to_delete in to_delete {
arroy.del_item_in_store(wtxn, to_delete, *fragment_id, dimensions)?;
hannoy.del_item_in_store(wtxn, to_delete, *fragment_id, dimensions)?;
}
}
}

View File

@@ -27,7 +27,7 @@ use crate::update::index_documents::helpers::{
};
use crate::update::settings::InnerIndexSettingsDiff;
use crate::vector::db::{EmbeddingStatusDelta, IndexEmbeddingConfig};
use crate::vector::ArroyWrapper;
use crate::vector::VectorStore;
use crate::{
lat_lng_to_xyz, CboRoaringBitmapCodec, DocumentId, FieldId, GeoPoint, Index, InternalError,
Result, SerializationError, U8StrStrCodec,
@@ -619,6 +619,7 @@ pub(crate) fn write_typed_chunk_into_index(
let _entered = span.enter();
let embedders = index.embedding_configs();
let index_version = index.get_version(wtxn)?.unwrap();
let mut remove_vectors_builder = MergerBuilder::new(KeepFirst);
let mut manual_vectors_builder = MergerBuilder::new(KeepFirst);
@@ -677,7 +678,12 @@ pub(crate) fn write_typed_chunk_into_index(
.get(&embedder_name)
.is_some_and(|conf| conf.is_quantized);
// FIXME: allow customizing distance
let writer = ArroyWrapper::new(index.vector_arroy, infos.embedder_id, binary_quantized);
let writer = VectorStore::new(
index_version,
index.vector_store,
infos.embedder_id,
binary_quantized,
);
// remove vectors for docids we want them removed
let merger = remove_vectors_builder.build();

View File

@@ -1,7 +1,8 @@
use grenad::CompressionType;
use super::GrenadParameters;
use crate::{thread_pool_no_abort::ThreadPoolNoAbort, ThreadPoolNoAbortBuilder};
use crate::thread_pool_no_abort::ThreadPoolNoAbort;
use crate::ThreadPoolNoAbortBuilder;
#[derive(Debug)]
pub struct IndexerConfig {

View File

@@ -255,9 +255,9 @@ impl<'a> From<FrameGrantR<'a>> for FrameWithHeader<'a> {
#[repr(u8)]
pub enum EntryHeader {
DbOperation(DbOperation),
ArroyDeleteVector(ArroyDeleteVector),
ArroySetVectors(ArroySetVectors),
ArroySetVector(ArroySetVector),
HannoyDeleteVector(HannoyDeleteVector),
HannoySetVectors(HannoySetVectors),
HannoySetVector(HannoySetVector),
}
impl EntryHeader {
@@ -268,9 +268,9 @@ impl EntryHeader {
const fn variant_id(&self) -> u8 {
match self {
EntryHeader::DbOperation(_) => 0,
EntryHeader::ArroyDeleteVector(_) => 1,
EntryHeader::ArroySetVectors(_) => 2,
EntryHeader::ArroySetVector(_) => 3,
EntryHeader::HannoyDeleteVector(_) => 1,
EntryHeader::HannoySetVectors(_) => 2,
EntryHeader::HannoySetVector(_) => 3,
}
}
@@ -286,26 +286,26 @@ impl EntryHeader {
}
const fn total_delete_vector_size() -> usize {
Self::variant_size() + mem::size_of::<ArroyDeleteVector>()
Self::variant_size() + mem::size_of::<HannoyDeleteVector>()
}
/// The `dimensions` corresponds to the number of `f32` in the embedding.
fn total_set_vectors_size(count: usize, dimensions: usize) -> usize {
let embedding_size = dimensions * mem::size_of::<f32>();
Self::variant_size() + mem::size_of::<ArroySetVectors>() + embedding_size * count
Self::variant_size() + mem::size_of::<HannoySetVectors>() + embedding_size * count
}
fn total_set_vector_size(dimensions: usize) -> usize {
let embedding_size = dimensions * mem::size_of::<f32>();
Self::variant_size() + mem::size_of::<ArroySetVector>() + embedding_size
Self::variant_size() + mem::size_of::<HannoySetVector>() + embedding_size
}
fn header_size(&self) -> usize {
let payload_size = match self {
EntryHeader::DbOperation(op) => mem::size_of_val(op),
EntryHeader::ArroyDeleteVector(adv) => mem::size_of_val(adv),
EntryHeader::ArroySetVectors(asvs) => mem::size_of_val(asvs),
EntryHeader::ArroySetVector(asv) => mem::size_of_val(asv),
EntryHeader::HannoyDeleteVector(adv) => mem::size_of_val(adv),
EntryHeader::HannoySetVectors(asvs) => mem::size_of_val(asvs),
EntryHeader::HannoySetVector(asv) => mem::size_of_val(asv),
};
Self::variant_size() + payload_size
}
@@ -319,19 +319,19 @@ impl EntryHeader {
EntryHeader::DbOperation(header)
}
1 => {
let header_bytes = &remaining[..mem::size_of::<ArroyDeleteVector>()];
let header_bytes = &remaining[..mem::size_of::<HannoyDeleteVector>()];
let header = checked::pod_read_unaligned(header_bytes);
EntryHeader::ArroyDeleteVector(header)
EntryHeader::HannoyDeleteVector(header)
}
2 => {
let header_bytes = &remaining[..mem::size_of::<ArroySetVectors>()];
let header_bytes = &remaining[..mem::size_of::<HannoySetVectors>()];
let header = checked::pod_read_unaligned(header_bytes);
EntryHeader::ArroySetVectors(header)
EntryHeader::HannoySetVectors(header)
}
3 => {
let header_bytes = &remaining[..mem::size_of::<ArroySetVector>()];
let header_bytes = &remaining[..mem::size_of::<HannoySetVector>()];
let header = checked::pod_read_unaligned(header_bytes);
EntryHeader::ArroySetVector(header)
EntryHeader::HannoySetVector(header)
}
id => panic!("invalid variant id: {id}"),
}
@@ -341,9 +341,9 @@ impl EntryHeader {
let (first, remaining) = header_bytes.split_first_mut().unwrap();
let payload_bytes = match self {
EntryHeader::DbOperation(op) => bytemuck::bytes_of(op),
EntryHeader::ArroyDeleteVector(adv) => bytemuck::bytes_of(adv),
EntryHeader::ArroySetVectors(asvs) => bytemuck::bytes_of(asvs),
EntryHeader::ArroySetVector(asv) => bytemuck::bytes_of(asv),
EntryHeader::HannoyDeleteVector(adv) => bytemuck::bytes_of(adv),
EntryHeader::HannoySetVectors(asvs) => bytemuck::bytes_of(asvs),
EntryHeader::HannoySetVector(asv) => bytemuck::bytes_of(asv),
};
*first = self.variant_id();
remaining.copy_from_slice(payload_bytes);
@@ -378,7 +378,7 @@ impl DbOperation {
#[derive(Debug, Clone, Copy, NoUninit, CheckedBitPattern)]
#[repr(transparent)]
pub struct ArroyDeleteVector {
pub struct HannoyDeleteVector {
pub docid: DocumentId,
}
@@ -386,13 +386,13 @@ pub struct ArroyDeleteVector {
#[repr(C)]
/// The embeddings are in the remaining space and represent
/// non-aligned [f32], each embedding made of `dimensions` f32s.
pub struct ArroySetVectors {
pub struct HannoySetVectors {
pub docid: DocumentId,
pub embedder_id: u8,
_padding: [u8; 3],
}
impl ArroySetVectors {
impl HannoySetVectors {
fn embeddings_bytes<'a>(frame: &'a FrameGrantR<'_>) -> &'a [u8] {
let skip = EntryHeader::variant_size() + mem::size_of::<Self>();
&frame[skip..]
@@ -416,14 +416,14 @@ impl ArroySetVectors {
#[repr(C)]
/// The embeddings are in the remaining space and represent
/// non-aligned [f32], each embedding made of `dimensions` f32s.
pub struct ArroySetVector {
pub struct HannoySetVector {
pub docid: DocumentId,
pub embedder_id: u8,
pub extractor_id: u8,
_padding: [u8; 2],
}
impl ArroySetVector {
impl HannoySetVector {
fn embeddings_bytes<'a>(frame: &'a FrameGrantR<'_>) -> &'a [u8] {
let skip = EntryHeader::variant_size() + mem::size_of::<Self>();
&frame[skip..]
@@ -553,7 +553,7 @@ impl<'b> ExtractorBbqueueSender<'b> {
let refcell = self.producers.get().unwrap();
let mut producer = refcell.0.borrow_mut_or_yield();
let payload_header = EntryHeader::ArroyDeleteVector(ArroyDeleteVector { docid });
let payload_header = EntryHeader::HannoyDeleteVector(HannoyDeleteVector { docid });
let total_length = EntryHeader::total_delete_vector_size();
if total_length > max_grant {
panic!("The entry is larger ({total_length} bytes) than the BBQueue max grant ({max_grant} bytes)");
@@ -589,8 +589,8 @@ impl<'b> ExtractorBbqueueSender<'b> {
// to zero to allocate no extra space at all
let dimensions = embeddings.first().map_or(0, |emb| emb.len());
let arroy_set_vector = ArroySetVectors { docid, embedder_id, _padding: [0; 3] };
let payload_header = EntryHeader::ArroySetVectors(arroy_set_vector);
let hannoy_set_vector = HannoySetVectors { docid, embedder_id, _padding: [0; 3] };
let payload_header = EntryHeader::HannoySetVectors(hannoy_set_vector);
let total_length = EntryHeader::total_set_vectors_size(embeddings.len(), dimensions);
if total_length > max_grant {
let mut value_file = tempfile::tempfile().map(BufWriter::new)?;
@@ -650,9 +650,9 @@ impl<'b> ExtractorBbqueueSender<'b> {
// to zero to allocate no extra space at all
let dimensions = embedding.as_ref().map_or(0, |emb| emb.len());
let arroy_set_vector =
ArroySetVector { docid, embedder_id, extractor_id, _padding: [0; 2] };
let payload_header = EntryHeader::ArroySetVector(arroy_set_vector);
let hannoy_set_vector =
HannoySetVector { docid, embedder_id, extractor_id, _padding: [0; 2] };
let payload_header = EntryHeader::HannoySetVector(hannoy_set_vector);
let total_length = EntryHeader::total_set_vector_size(dimensions);
if total_length > max_grant {
let mut value_file = tempfile::tempfile().map(BufWriter::new)?;
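The renamed Hannoy* headers keep the same frame layout as before: one variant-id byte, a fixed-size #[repr(C)] header, then the unaligned f32 embedding payload. A minimal sketch of that layout for the single-vector case, with illustrative helper names and byte order (not the exact bbqueue wire format):

fn encode_set_vector(docid: u32, embedder_id: u8, extractor_id: u8, embedding: &[f32]) -> Vec<u8> {
    let mut frame = Vec::with_capacity(1 + 8 + embedding.len() * 4);
    frame.push(3); // variant id of HannoySetVector in the dispatch above
    frame.extend_from_slice(&docid.to_le_bytes()); // 4-byte docid
    frame.push(embedder_id);
    frame.push(extractor_id);
    frame.extend_from_slice(&[0; 2]); // explicit padding, as in the #[repr(C)] struct
    for value in embedding {
        frame.extend_from_slice(&value.to_le_bytes()); // unaligned f32s fill the rest
    }
    frame
}

fn decode_embedding(frame: &[u8]) -> Vec<f32> {
    // Skip the variant byte plus the fixed header, then read the trailing f32s.
    frame[1 + 8..].chunks_exact(4).map(|b| f32::from_le_bytes(b.try_into().unwrap())).collect()
}

fn main() {
    let frame = encode_set_vector(42, 0, 1, &[0.25, 0.5, 0.75]);
    assert_eq!(decode_embedding(&frame), vec![0.25, 0.5, 0.75]);
}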

View File

@@ -240,12 +240,12 @@ impl<'extractor> SettingsChangeExtractor<'extractor> for SettingsChangeDocumentE
/// modifies them by adding or removing vector fields based on embedder actions,
/// and then updates the database.
#[tracing::instrument(level = "trace", skip_all, target = "indexing::documents::extract")]
pub fn update_database_documents<'indexer, 'extractor, MSP, SD>(
pub fn update_database_documents<'indexer, MSP, SD>(
documents: &'indexer DocumentsIndentifiers<'indexer>,
indexing_context: IndexingContext<MSP>,
extractor_sender: &ExtractorBbqueueSender,
settings_delta: &SD,
extractor_allocs: &'extractor mut ThreadLocal<FullySend<Bump>>,
extractor_allocs: &mut ThreadLocal<FullySend<Bump>>,
) -> Result<()>
where
MSP: Fn() -> bool + Sync,

View File

@@ -17,6 +17,7 @@ use super::guess_primary_key::retrieve_or_guess_primary_key;
use crate::documents::PrimaryKey;
use crate::progress::{AtomicPayloadStep, Progress};
use crate::update::new::document::{DocumentContext, Versions};
use crate::update::new::indexer::sharding::Shards;
use crate::update::new::steps::IndexingStep;
use crate::update::new::thread_local::MostlySend;
use crate::update::new::{DocumentIdentifiers, Insertion, Update};
@@ -71,6 +72,7 @@ impl<'pl> DocumentOperation<'pl> {
new_fields_ids_map: &mut FieldsIdsMap,
must_stop_processing: &MSP,
progress: Progress,
shards: Option<&Shards>,
) -> Result<(DocumentOperationChanges<'pl>, Vec<PayloadStats>, Option<PrimaryKey<'pl>>)>
where
MSP: Fn() -> bool,
@@ -107,6 +109,7 @@ impl<'pl> DocumentOperation<'pl> {
&mut bytes,
&docids_version_offsets,
IndexDocumentsMethod::ReplaceDocuments,
shards,
payload,
),
Payload::Update(payload) => extract_addition_payload_changes(
@@ -120,6 +123,7 @@ impl<'pl> DocumentOperation<'pl> {
&mut bytes,
&docids_version_offsets,
IndexDocumentsMethod::UpdateDocuments,
shards,
payload,
),
Payload::Deletion(to_delete) => extract_deletion_payload_changes(
@@ -127,6 +131,7 @@ impl<'pl> DocumentOperation<'pl> {
rtxn,
&mut available_docids,
&docids_version_offsets,
shards,
to_delete,
),
};
@@ -173,6 +178,7 @@ fn extract_addition_payload_changes<'r, 'pl: 'r>(
bytes: &mut u64,
main_docids_version_offsets: &hashbrown::HashMap<&'pl str, PayloadOperations<'pl>>,
method: IndexDocumentsMethod,
shards: Option<&Shards>,
payload: &'pl [u8],
) -> Result<hashbrown::HashMap<&'pl str, PayloadOperations<'pl>>> {
use IndexDocumentsMethod::{ReplaceDocuments, UpdateDocuments};
@@ -210,12 +216,20 @@ fn extract_addition_payload_changes<'r, 'pl: 'r>(
primary_key.as_ref().unwrap()
};
let current_offset = iter.byte_offset();
let content = &payload[previous_offset..current_offset];
previous_offset = current_offset;
let external_id =
retrieved_primary_key.extract_fields_and_docid(doc, new_fields_ids_map, indexer)?;
let external_id = external_id.to_de();
let current_offset = iter.byte_offset();
let document_offset = DocumentOffset { content: &payload[previous_offset..current_offset] };
if shards.is_some_and(|shards| !shards.must_process(external_id)) {
continue;
}
let document_offset = DocumentOffset { content };
match main_docids_version_offsets.get(external_id) {
None => {
@@ -299,8 +313,6 @@ fn extract_addition_payload_changes<'r, 'pl: 'r>(
},
},
}
previous_offset = iter.byte_offset();
}
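The content slice and the `previous_offset` update move above the id extraction, most likely so that the new shard filter's `continue` cannot skip the offset bookkeeping; if it did, the next document's slice would span two payloads. A toy, self-contained illustration of that invariant:

// Toy reproduction of the offset invariant: slices must advance even for
// skipped records, otherwise the next record's slice covers two records.
fn main() {
    let payload = b"{\"id\":1}{\"id\":2}{\"id\":3}";
    let boundaries = [8usize, 16, 24]; // byte offset after each record
    let mut previous = 0;
    for (i, &current) in boundaries.iter().enumerate() {
        let content = &payload[previous..current];
        previous = current; // advance before any `continue`
        if i == 1 {
            continue; // pretend this record belongs to another shard
        }
        println!("record {i}: {}", std::str::from_utf8(content).unwrap());
    }
}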
if payload.is_empty() {
@@ -329,11 +341,16 @@ fn extract_deletion_payload_changes<'s, 'pl: 's>(
rtxn: &RoTxn,
available_docids: &mut AvailableIds,
main_docids_version_offsets: &hashbrown::HashMap<&'s str, PayloadOperations<'pl>>,
shards: Option<&Shards>,
to_delete: &'pl [&'pl str],
) -> Result<hashbrown::HashMap<&'s str, PayloadOperations<'pl>>> {
let mut new_docids_version_offsets = hashbrown::HashMap::<&str, PayloadOperations<'pl>>::new();
for external_id in to_delete {
if shards.is_some_and(|shards| !shards.must_process(external_id)) {
continue;
}
match main_docids_version_offsets.get(external_id) {
None => {
match index.external_documents_ids().get(rtxn, external_id) {

View File

@@ -8,7 +8,7 @@ use document_changes::{DocumentChanges, IndexingContext};
pub use document_deletion::DocumentDeletion;
pub use document_operation::{DocumentOperation, PayloadStats};
use hashbrown::HashMap;
use heed::RwTxn;
use heed::{RoTxn, RwTxn};
pub use partial_dump::PartialDump;
pub use post_processing::recompute_word_fst_from_word_docids_database;
pub use update_by_function::UpdateByFunction;
@@ -24,7 +24,7 @@ use crate::progress::{EmbedderStats, Progress};
use crate::update::settings::SettingsDelta;
use crate::update::GrenadParameters;
use crate::vector::settings::{EmbedderAction, RemoveFragments, WriteBackToDocuments};
use crate::vector::{ArroyWrapper, Embedder, RuntimeEmbedders};
use crate::vector::{Embedder, RuntimeEmbedders, VectorStore};
use crate::{FieldsIdsMap, GlobalFieldsIdsMap, Index, InternalError, Result, ThreadPoolNoAbort};
pub(crate) mod de;
@@ -36,6 +36,7 @@ mod guess_primary_key;
mod partial_dump;
mod post_processing;
pub mod settings_changes;
pub mod sharding;
mod update_by_function;
mod write;
@@ -66,7 +67,7 @@ where
let mut bbbuffers = Vec::new();
let finished_extraction = AtomicBool::new(false);
let arroy_memory = grenad_parameters.max_memory;
let hannoy_memory = grenad_parameters.max_memory;
let (grenad_parameters, total_bbbuffer_capacity) =
indexer_memory_settings(pool.current_num_threads(), grenad_parameters);
@@ -129,8 +130,9 @@ where
let global_fields_ids_map = GlobalFieldsIdsMap::new(&new_fields_ids_map);
let vector_arroy = index.vector_arroy;
let arroy_writers: Result<HashMap<_, _>> = embedders
let vector_arroy = index.vector_store;
let index_version = index.get_version(wtxn)?.unwrap();
let hannoy_writers: Result<HashMap<_, _>> = embedders
.inner_as_ref()
.iter()
.map(|(embedder_name, runtime)| {
@@ -143,7 +145,12 @@ where
})?;
let dimensions = runtime.embedder.dimensions();
let writer = ArroyWrapper::new(vector_arroy, embedder_index, runtime.is_quantized);
let writer = VectorStore::new(
index_version,
vector_arroy,
embedder_index,
runtime.is_quantized,
);
Ok((
embedder_index,
@@ -152,10 +159,10 @@ where
})
.collect();
let mut arroy_writers = arroy_writers?;
let mut hannoy_writers = hannoy_writers?;
let congestion =
write_to_db(writer_receiver, finished_extraction, index, wtxn, &arroy_writers)?;
write_to_db(writer_receiver, finished_extraction, index, wtxn, &hannoy_writers)?;
indexing_context.progress.update_progress(IndexingStep::WaitingForExtractors);
@@ -169,8 +176,8 @@ where
wtxn,
indexing_context.progress,
index_embeddings,
arroy_memory,
&mut arroy_writers,
hannoy_memory,
&mut hannoy_writers,
None,
&indexing_context.must_stop_processing,
)
@@ -226,7 +233,7 @@ where
let mut bbbuffers = Vec::new();
let finished_extraction = AtomicBool::new(false);
let arroy_memory = grenad_parameters.max_memory;
let hannoy_memory = grenad_parameters.max_memory;
let (grenad_parameters, total_bbbuffer_capacity) =
indexer_memory_settings(pool.current_num_threads(), grenad_parameters);
@@ -283,15 +290,16 @@ where
let new_embedders = settings_delta.new_embedders();
let embedder_actions = settings_delta.embedder_actions();
let index_embedder_category_ids = settings_delta.new_embedder_category_id();
let mut arroy_writers = arroy_writers_from_embedder_actions(
let mut hannoy_writers = hannoy_writers_from_embedder_actions(
index,
wtxn,
embedder_actions,
new_embedders,
index_embedder_category_ids,
)?;
let congestion =
write_to_db(writer_receiver, finished_extraction, index, wtxn, &arroy_writers)?;
write_to_db(writer_receiver, finished_extraction, index, wtxn, &hannoy_writers)?;
indexing_context.progress.update_progress(IndexingStep::WaitingForExtractors);
@@ -305,8 +313,8 @@ where
wtxn,
indexing_context.progress,
index_embeddings,
arroy_memory,
&mut arroy_writers,
hannoy_memory,
&mut hannoy_writers,
Some(embedder_actions),
&indexing_context.must_stop_processing,
)
@@ -336,13 +344,15 @@ where
Ok(congestion)
}
fn arroy_writers_from_embedder_actions<'indexer>(
fn hannoy_writers_from_embedder_actions<'indexer>(
index: &Index,
rtxn: &RoTxn,
embedder_actions: &'indexer BTreeMap<String, EmbedderAction>,
embedders: &'indexer RuntimeEmbedders,
index_embedder_category_ids: &'indexer std::collections::HashMap<String, u8>,
) -> Result<HashMap<u8, (&'indexer str, &'indexer Embedder, ArroyWrapper, usize)>> {
let vector_arroy = index.vector_arroy;
) -> Result<HashMap<u8, (&'indexer str, &'indexer Embedder, VectorStore, usize)>> {
let vector_arroy = index.vector_store;
let index_version = index.get_version(rtxn)?.unwrap();
embedders
.inner_as_ref()
@@ -360,8 +370,12 @@ fn arroy_writers_from_embedder_actions<'indexer>(
},
)));
};
let writer =
ArroyWrapper::new(vector_arroy, embedder_category_id, action.was_quantized);
let writer = VectorStore::new(
index_version,
vector_arroy,
embedder_category_id,
action.was_quantized,
);
let dimensions = runtime.embedder.dimensions();
Some(Ok((
embedder_category_id,
@@ -384,7 +398,12 @@ where
let Some(WriteBackToDocuments { embedder_id, .. }) = action.write_back() else {
continue;
};
let reader = ArroyWrapper::new(index.vector_arroy, *embedder_id, action.was_quantized);
let reader = VectorStore::new(
index.get_version(wtxn)?.unwrap(),
index.vector_store,
*embedder_id,
action.was_quantized,
);
let Some(dimensions) = reader.dimensions(wtxn)? else {
continue;
};
@@ -400,7 +419,12 @@ where
let Some(infos) = index.embedding_configs().embedder_info(wtxn, embedder_name)? else {
continue;
};
let arroy = ArroyWrapper::new(index.vector_arroy, infos.embedder_id, was_quantized);
let arroy = VectorStore::new(
index.get_version(wtxn)?.unwrap(),
index.vector_store,
infos.embedder_id,
was_quantized,
);
let Some(dimensions) = arroy.dimensions(wtxn)? else {
continue;
};

View File

@@ -0,0 +1,22 @@
// Copyright © 2025 Meilisearch Some Rights Reserved
// This file is part of Meilisearch Enterprise Edition (EE).
// Use of this source code is governed by the Business Source License 1.1,
// as found in the LICENSE-EE file or at <https://mariadb.com/bsl11>
use std::hash::{BuildHasher as _, BuildHasherDefault};
pub struct Shards {
pub own: Vec<String>,
pub others: Vec<String>,
}
impl Shards {
pub fn must_process(&self, docid: &str) -> bool {
let hasher = BuildHasherDefault::<twox_hash::XxHash3_64>::new();
let to_hash = |shard: &String| hasher.hash_one((shard, docid));
let max_hash = self.others.iter().map(to_hash).max().unwrap_or_default();
self.own.iter().map(to_hash).any(|hash| hash > max_hash)
}
}
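Shards::must_process is essentially rendezvous (highest-random-weight) hashing: every shard name is hashed together with the document id, and the document is processed locally only when one of the local shard names outscores every remote one. A self-contained sketch of the same idea, using std's default hasher instead of twox_hash:

use std::collections::hash_map::RandomState;
use std::hash::BuildHasher;

fn owned_locally(own: &[String], others: &[String], docid: &str) -> bool {
    let hasher = RandomState::new();
    let score = |shard: &String| hasher.hash_one((shard, docid));
    let best_remote = others.iter().map(score).max().unwrap_or_default();
    // Process the document here only if one of our shards outscores every remote shard.
    own.iter().map(score).any(|h| h > best_remote)
}

fn main() {
    let own = vec!["shard-a".to_string()];
    let others = vec!["shard-b".to_string(), "shard-c".to_string()];
    for docid in ["doc-1", "doc-2", "doc-3"] {
        println!("{docid}: processed here = {}", owned_locally(&own, &others, docid));
    }
}

Note that the real implementation pins the hasher to twox_hash's XxHash3_64, so every node computes identical scores; RandomState above is not stable across processes and is only for illustration.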

View File

@@ -15,7 +15,7 @@ use crate::progress::Progress;
use crate::update::settings::InnerIndexSettings;
use crate::vector::db::IndexEmbeddingConfig;
use crate::vector::settings::EmbedderAction;
use crate::vector::{ArroyWrapper, Embedder, Embeddings, RuntimeEmbedders};
use crate::vector::{Embedder, Embeddings, RuntimeEmbedders, VectorStore};
use crate::{Error, Index, InternalError, Result, UserError};
pub fn write_to_db(
@@ -23,9 +23,9 @@ pub fn write_to_db(
finished_extraction: &AtomicBool,
index: &Index,
wtxn: &mut RwTxn<'_>,
arroy_writers: &HashMap<u8, (&str, &Embedder, ArroyWrapper, usize)>,
hannoy_writers: &HashMap<u8, (&str, &Embedder, VectorStore, usize)>,
) -> Result<ChannelCongestion> {
// Used by by the ArroySetVector to copy the embedding into an
// Used by the HannoySetVector to copy the embedding into an

// aligned memory area, required by arroy to accept a new vector.
let mut aligned_embedding = Vec::new();
let span = tracing::trace_span!(target: "indexing::write_db", "all");
@@ -56,7 +56,7 @@ pub fn write_to_db(
ReceiverAction::LargeVectors(large_vectors) => {
let LargeVectors { docid, embedder_id, .. } = large_vectors;
let (_, _, writer, dimensions) =
arroy_writers.get(&embedder_id).expect("requested a missing embedder");
hannoy_writers.get(&embedder_id).expect("requested a missing embedder");
let mut embeddings = Embeddings::new(*dimensions);
for embedding in large_vectors.read_embeddings(*dimensions) {
embeddings.push(embedding.to_vec()).unwrap();
@@ -68,7 +68,7 @@ pub fn write_to_db(
large_vector @ LargeVector { docid, embedder_id, extractor_id, .. },
) => {
let (_, _, writer, dimensions) =
arroy_writers.get(&embedder_id).expect("requested a missing embedder");
hannoy_writers.get(&embedder_id).expect("requested a missing embedder");
let embedding = large_vector.read_embedding(*dimensions);
writer.add_item_in_store(wtxn, docid, extractor_id, embedding)?;
}
@@ -80,12 +80,12 @@ pub fn write_to_db(
&mut writer_receiver,
index,
wtxn,
arroy_writers,
hannoy_writers,
&mut aligned_embedding,
)?;
}
write_from_bbqueue(&mut writer_receiver, index, wtxn, arroy_writers, &mut aligned_embedding)?;
write_from_bbqueue(&mut writer_receiver, index, wtxn, hannoy_writers, &mut aligned_embedding)?;
Ok(ChannelCongestion {
attempts: writer_receiver.sent_messages_attempts(),
@@ -115,8 +115,8 @@ pub fn build_vectors<MSP>(
wtxn: &mut RwTxn<'_>,
progress: &Progress,
index_embeddings: Vec<IndexEmbeddingConfig>,
arroy_memory: Option<usize>,
arroy_writers: &mut HashMap<u8, (&str, &Embedder, ArroyWrapper, usize)>,
hannoy_memory: Option<usize>,
hannoy_writers: &mut HashMap<u8, (&str, &Embedder, VectorStore, usize)>,
embeder_actions: Option<&BTreeMap<String, EmbedderAction>>,
must_stop_processing: &MSP,
) -> Result<()>
@@ -129,18 +129,18 @@ where
let seed = rand::random();
let mut rng = rand::rngs::StdRng::seed_from_u64(seed);
for (_index, (embedder_name, _embedder, writer, dimensions)) in arroy_writers {
for (_index, (embedder_name, _embedder, writer, dimensions)) in hannoy_writers {
let dimensions = *dimensions;
let is_being_quantized = embeder_actions
.and_then(|actions| actions.get(*embedder_name).map(|action| action.is_being_quantized))
.unwrap_or(false);
writer.build_and_quantize(
wtxn,
progress,
progress.clone(),
&mut rng,
dimensions,
is_being_quantized,
arroy_memory,
hannoy_memory,
must_stop_processing,
)?;
}
@@ -181,7 +181,7 @@ pub fn write_from_bbqueue(
writer_receiver: &mut WriterBbqueueReceiver<'_>,
index: &Index,
wtxn: &mut RwTxn<'_>,
arroy_writers: &HashMap<u8, (&str, &crate::vector::Embedder, ArroyWrapper, usize)>,
hannoy_writers: &HashMap<u8, (&str, &crate::vector::Embedder, VectorStore, usize)>,
aligned_embedding: &mut Vec<f32>,
) -> crate::Result<()> {
while let Some(frame_with_header) = writer_receiver.recv_frame() {
@@ -221,17 +221,17 @@ pub fn write_from_bbqueue(
},
}
}
EntryHeader::ArroyDeleteVector(ArroyDeleteVector { docid }) => {
for (_index, (_name, _embedder, writer, dimensions)) in arroy_writers {
EntryHeader::HannoyDeleteVector(HannoyDeleteVector { docid }) => {
for (_index, (_name, _embedder, writer, dimensions)) in hannoy_writers {
let dimensions = *dimensions;
writer.del_items(wtxn, dimensions, docid)?;
}
}
EntryHeader::ArroySetVectors(asvs) => {
let ArroySetVectors { docid, embedder_id, .. } = asvs;
EntryHeader::HannoySetVectors(asvs) => {
let HannoySetVectors { docid, embedder_id, .. } = asvs;
let frame = frame_with_header.frame();
let (_, _, writer, dimensions) =
arroy_writers.get(&embedder_id).expect("requested a missing embedder");
hannoy_writers.get(&embedder_id).expect("requested a missing embedder");
let mut embeddings = Embeddings::new(*dimensions);
let all_embeddings = asvs.read_all_embeddings_into_vec(frame, aligned_embedding);
writer.del_items(wtxn, *dimensions, docid)?;
@@ -245,12 +245,12 @@ pub fn write_from_bbqueue(
writer.add_items(wtxn, docid, &embeddings)?;
}
}
EntryHeader::ArroySetVector(
asv @ ArroySetVector { docid, embedder_id, extractor_id, .. },
EntryHeader::HannoySetVector(
asv @ HannoySetVector { docid, embedder_id, extractor_id, .. },
) => {
let frame = frame_with_header.frame();
let (_, _, writer, dimensions) =
arroy_writers.get(&embedder_id).expect("requested a missing embedder");
hannoy_writers.get(&embedder_id).expect("requested a missing embedder");
let embedding = asv.read_all_embeddings_into_vec(frame, aligned_embedding);
if embedding.is_empty() {

View File

@@ -63,8 +63,8 @@ where
}
#[tracing::instrument(level = "trace", skip_all, target = "indexing::merge")]
pub fn merge_and_send_docids<'extractor, MSP, D>(
mut caches: Vec<BalancedCaches<'extractor>>,
pub fn merge_and_send_docids<MSP, D>(
mut caches: Vec<BalancedCaches<'_>>,
database: Database<Bytes, Bytes>,
index: &Index,
docids_sender: WordDocidsSender<D>,
@@ -91,8 +91,8 @@ where
}
#[tracing::instrument(level = "trace", skip_all, target = "indexing::merge")]
pub fn merge_and_send_facet_docids<'extractor>(
mut caches: Vec<BalancedCaches<'extractor>>,
pub fn merge_and_send_facet_docids(
mut caches: Vec<BalancedCaches<'_>>,
database: FacetDatabases,
index: &Index,
rtxn: &RoTxn,

View File

@@ -14,7 +14,7 @@ use crate::constants::RESERVED_VECTORS_FIELD_NAME;
use crate::documents::FieldIdMapper;
use crate::vector::db::{EmbeddingStatus, IndexEmbeddingConfig};
use crate::vector::parsed_vectors::{RawVectors, RawVectorsError, VectorOrArrayOfVectors};
use crate::vector::{ArroyWrapper, Embedding, RuntimeEmbedders};
use crate::vector::{Embedding, RuntimeEmbedders, VectorStore};
use crate::{DocumentId, Index, InternalError, Result, UserError};
#[derive(Serialize)]
@@ -120,8 +120,13 @@ impl<'t> VectorDocumentFromDb<'t> {
config: &IndexEmbeddingConfig,
status: &EmbeddingStatus,
) -> Result<VectorEntry<'t>> {
let reader =
ArroyWrapper::new(self.index.vector_arroy, embedder_id, config.config.quantized());
let index_version = self.index.get_version(self.rtxn)?.unwrap();
let reader = VectorStore::new(
index_version,
self.index.vector_store,
embedder_id,
config.config.quantized(),
);
let vectors = reader.item_vectors(self.rtxn, self.docid)?;
Ok(VectorEntry {
@@ -149,7 +154,7 @@ impl<'t> VectorDocument<'t> for VectorDocumentFromDb<'t> {
name,
entry_from_raw_value(value, false).map_err(|_| {
InternalError::Serialization(crate::SerializationError::Decoding {
db_name: Some(crate::index::db_name::VECTOR_ARROY),
db_name: Some(crate::index::db_name::VECTOR_STORE),
})
})?,
))
@@ -167,7 +172,7 @@ impl<'t> VectorDocument<'t> for VectorDocumentFromDb<'t> {
Some(embedding_from_doc) => {
Some(entry_from_raw_value(embedding_from_doc, false).map_err(|_| {
InternalError::Serialization(crate::SerializationError::Decoding {
db_name: Some(crate::index::db_name::VECTOR_ARROY),
db_name: Some(crate::index::db_name::VECTOR_STORE),
})
})?)
}

View File

@@ -1,17 +1,20 @@
mod new_hannoy;
mod v1_12;
mod v1_13;
mod v1_14;
mod v1_15;
mod v1_16;
use heed::RwTxn;
use new_hannoy::Latest_V1_18_New_Hannoy;
use v1_12::{V1_12_3_To_V1_13_0, V1_12_To_V1_12_3};
use v1_13::{V1_13_0_To_V1_13_1, V1_13_1_To_Latest_V1_13};
use v1_14::Latest_V1_13_To_Latest_V1_14;
use v1_15::Latest_V1_14_To_Latest_V1_15;
use v1_16::Latest_V1_15_To_V1_16_0;
use crate::constants::{VERSION_MAJOR, VERSION_MINOR, VERSION_PATCH};
use crate::progress::{Progress, VariableNameStep};
use crate::update::upgrade::v1_16::Latest_V1_15_To_V1_16_0;
use crate::{Index, InternalError, Result};
trait UpgradeIndex {
@@ -34,6 +37,8 @@ const UPGRADE_FUNCTIONS: &[&dyn UpgradeIndex] = &[
&Latest_V1_13_To_Latest_V1_14 {},
&Latest_V1_14_To_Latest_V1_15 {},
&Latest_V1_15_To_V1_16_0 {},
&ToTargetNoOp { target: (1, 18, 0) },
&Latest_V1_18_New_Hannoy {},
// This is the last upgrade function, it will be called when the index is up to date.
// any other upgrade function should be added before this one.
&ToCurrentNoOp {},
@@ -61,9 +66,9 @@ const fn start(from: (u32, u32, u32)) -> Option<usize> {
(1, 14, _) => function_index!(5),
// We must handle the current version in the match because, in case of a failure, some indexes may have been upgraded but not others.
(1, 15, _) => function_index!(6),
(1, 16, _) => function_index!(7),
(1, 17, _) => function_index!(7),
(1, 18, _) => function_index!(7),
(1, 16, _) | (1, 17, _) => function_index!(7),
(1, 18, _) => function_index!(8),
(1, 22, _) => function_index!(9),
// We deliberately don't add a placeholder with (VERSION_MAJOR, VERSION_MINOR, VERSION_PATCH) here to force manually
// considering dumpless upgrade.
(_major, _minor, _patch) => return None,
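Read together with the UPGRADE_FUNCTIONS list above, the new arms map 1.16/1.17 indexes to entry 7 (ToTargetNoOp bumping to 1.18.0), 1.18 indexes to entry 8 (Latest_V1_18_New_Hannoy, targeting 1.22.0), and 1.22 indexes to entry 9 (ToCurrentNoOp). A hypothetical walk of the chain for an index found at v1.16 (version numbers past 1.22.0 are illustrative):

fn main() {
    let steps: &[(&str, (u32, u32, u32))] = &[
        ("ToTargetNoOp", (1, 18, 0)),            // function_index!(7): version-only bump
        ("Latest_V1_18_New_Hannoy", (1, 22, 0)), // function_index!(8): arroy -> hannoy conversion
        ("ToCurrentNoOp", (1, 22, 0)),           // function_index!(9): whatever VERSION_* resolve to
    ];
    let mut version = (1, 16, 0);
    for (name, target) in steps {
        version = *target;
        println!("{name} -> {version:?}");
    }
}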
@@ -146,3 +151,25 @@ impl UpgradeIndex for ToCurrentNoOp {
(VERSION_MAJOR, VERSION_MINOR, VERSION_PATCH)
}
}
/// Perform no operation during the upgrade except changing to the specified target version.
#[allow(non_camel_case_types)]
struct ToTargetNoOp {
pub target: (u32, u32, u32),
}
impl UpgradeIndex for ToTargetNoOp {
fn upgrade(
&self,
_wtxn: &mut RwTxn,
_index: &Index,
_original: (u32, u32, u32),
_progress: Progress,
) -> Result<bool> {
Ok(false)
}
fn target_version(&self) -> (u32, u32, u32) {
self.target
}
}

View File

@@ -0,0 +1,36 @@
use heed::RwTxn;
use super::UpgradeIndex;
use crate::progress::Progress;
use crate::vector::VectorStore;
use crate::{Index, Result};
#[allow(non_camel_case_types)]
pub(super) struct Latest_V1_18_New_Hannoy();
impl UpgradeIndex for Latest_V1_18_New_Hannoy {
fn upgrade(
&self,
wtxn: &mut RwTxn,
index: &Index,
_original: (u32, u32, u32),
progress: Progress,
) -> Result<bool> {
let embedding_configs = index.embedding_configs();
let index_version = index.get_version(wtxn)?.unwrap();
for config in embedding_configs.embedding_configs(wtxn)? {
// TODO use the embedder name to display progress
let quantized = config.config.quantized();
let embedder_id = embedding_configs.embedder_id(wtxn, &config.name)?.unwrap();
let vector_store =
VectorStore::new(index_version, index.vector_store, embedder_id, quantized);
vector_store.convert_from_arroy(wtxn, progress.clone())?;
}
Ok(false)
}
fn target_version(&self) -> (u32, u32, u32) {
(1, 22, 0)
}
}

View File

@@ -27,9 +27,9 @@ impl UpgradeIndex for Latest_V1_13_To_Latest_V1_14 {
let rtxn = index.read_txn()?;
arroy::upgrade::from_0_5_to_0_6::<Cosine>(
&rtxn,
index.vector_arroy.remap_data_type(),
index.vector_store.remap_types(),
wtxn,
index.vector_arroy.remap_data_type(),
index.vector_store.remap_types(),
)?;
Ok(false)

View File

@@ -1,6 +1,6 @@
use std::time::Instant;
use arroy::Distance;
use hannoy::Distance;
use super::error::CompositeEmbedderContainsHuggingFace;
use super::{
@@ -324,19 +324,18 @@ fn check_similarity(
}
for (left, right) in left.into_iter().zip(right) {
let left = arroy::internals::UnalignedVector::from_slice(&left);
let right = arroy::internals::UnalignedVector::from_slice(&right);
let left = arroy::internals::Leaf {
header: arroy::distances::Cosine::new_header(&left),
let left = hannoy::internals::UnalignedVector::from_slice(&left);
let right = hannoy::internals::UnalignedVector::from_slice(&right);
let left = hannoy::internals::Item {
header: hannoy::distances::Cosine::new_header(&left),
vector: left,
};
let right = arroy::internals::Leaf {
header: arroy::distances::Cosine::new_header(&right),
let right = hannoy::internals::Item {
header: hannoy::distances::Cosine::new_header(&right),
vector: right,
};
let distance = arroy::distances::Cosine::built_distance(&left, &right);
let distance = hannoy::distances::Cosine::distance(&left, &right);
if distance > super::MAX_COMPOSITE_DISTANCE {
return Err(NewEmbedderError::composite_embedding_value_mismatch(distance, hint));
}
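The similarity check now goes through hannoy's Cosine distance. Assuming that distance is the usual 1 − cosine-similarity, here is a dependency-free illustration of the value being compared against MAX_COMPOSITE_DISTANCE (0.01):

// Plain-Rust illustration of the cosine distance thresholded above; hannoy's
// internals are not needed to follow the check.
fn cosine_distance(a: &[f32], b: &[f32]) -> f32 {
    let dot: f32 = a.iter().zip(b).map(|(x, y)| x * y).sum();
    let na: f32 = a.iter().map(|x| x * x).sum::<f32>().sqrt();
    let nb: f32 = b.iter().map(|x| x * x).sum::<f32>().sqrt();
    1.0 - dot / (na * nb)
}

fn main() {
    let left = [0.1_f32, 0.9, 0.3];
    let right = [0.1_f32, 0.89, 0.31];
    // A composite embedder is rejected when this exceeds MAX_COMPOSITE_DISTANCE (0.01).
    println!("distance = {}", cosine_distance(&left, &right));
}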

View File

@@ -3,11 +3,12 @@ use std::num::NonZeroUsize;
use std::sync::{Arc, Mutex};
use std::time::Instant;
use arroy::distances::{BinaryQuantizedCosine, Cosine};
use arroy::ItemId;
use deserr::{DeserializeError, Deserr};
use hannoy::distances::{Cosine, Hamming};
use hannoy::ItemId;
use heed::{RoTxn, RwTxn, Unspecified};
use ordered_float::OrderedFloat;
use rand::SeedableRng as _;
use roaring::RoaringBitmap;
use serde::{Deserialize, Serialize};
use utoipa::ToSchema;
@@ -41,31 +42,43 @@ pub type Embedding = Vec<f32>;
pub const REQUEST_PARALLELISM: usize = 40;
pub const MAX_COMPOSITE_DISTANCE: f32 = 0.01;
pub struct ArroyWrapper {
quantized: bool,
const HANNOY_EF_CONSTRUCTION: usize = 125;
const HANNOY_M: usize = 16;
const HANNOY_M0: usize = 32;
pub struct VectorStore {
version: (u32, u32, u32),
database: hannoy::Database<Unspecified>,
embedder_index: u8,
database: arroy::Database<Unspecified>,
quantized: bool,
}
impl ArroyWrapper {
impl VectorStore {
pub fn new(
database: arroy::Database<Unspecified>,
version: (u32, u32, u32),
database: hannoy::Database<Unspecified>,
embedder_index: u8,
quantized: bool,
) -> Self {
Self { database, embedder_index, quantized }
Self { version, database, embedder_index, quantized }
}
pub fn embedder_index(&self) -> u8 {
self.embedder_index
}
fn readers<'a, D: arroy::Distance>(
/// Whether we must use arroy to read the vector store.
pub fn version_uses_arroy(&self) -> bool {
let (major, minor, _patch) = self.version;
major == 1 && minor < 18
}
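A quick illustration of the cut-off encoded by version_uses_arroy: indexes last written by Meilisearch 1.17 or earlier are still read through arroy, while anything at or after the 1.18 layout goes through hannoy.

fn uses_arroy(version: (u32, u32, u32)) -> bool {
    let (major, minor, _patch) = version;
    major == 1 && minor < 18
}

fn main() {
    assert!(uses_arroy((1, 17, 3)));  // pre-hannoy index: read with arroy
    assert!(!uses_arroy((1, 18, 0))); // already migrated: read with hannoy
    assert!(!uses_arroy((1, 22, 0)));
}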
fn arroy_readers<'a, D: arroy::Distance>(
&'a self,
rtxn: &'a RoTxn<'a>,
db: arroy::Database<D>,
) -> impl Iterator<Item = Result<arroy::Reader<'a, D>, arroy::Error>> + 'a {
arroy_store_range_for_embedder(self.embedder_index).filter_map(move |index| {
vector_store_range_for_embedder(self.embedder_index).filter_map(move |index| {
match arroy::Reader::open(rtxn, index, db) {
Ok(reader) => match reader.is_empty(rtxn) {
Ok(false) => Some(Ok(reader)),
@@ -78,6 +91,24 @@ impl ArroyWrapper {
})
}
fn readers<'a, D: hannoy::Distance>(
&'a self,
rtxn: &'a RoTxn<'a>,
db: hannoy::Database<D>,
) -> impl Iterator<Item = Result<hannoy::Reader<'a, D>, hannoy::Error>> + 'a {
vector_store_range_for_embedder(self.embedder_index).filter_map(move |index| {
match hannoy::Reader::open(rtxn, index, db) {
Ok(reader) => match reader.is_empty(rtxn) {
Ok(false) => Some(Ok(reader)),
Ok(true) => None,
Err(e) => Some(Err(e)),
},
Err(hannoy::Error::MissingMetadata(_)) => None,
Err(e) => Some(Err(e)),
}
})
}
/// The item ids that are present in the store specified by its id.
///
/// The ids are accessed via a lambda to avoid lifetime shenanigans.
@@ -86,18 +117,27 @@ impl ArroyWrapper {
rtxn: &RoTxn,
store_id: u8,
with_items: F,
) -> Result<O, arroy::Error>
) -> crate::Result<O>
where
F: FnOnce(&RoaringBitmap) -> O,
{
if self.quantized {
if self.version_uses_arroy() {
if self.quantized {
self._arroy_items_in_store(rtxn, self.arroy_quantized_db(), store_id, with_items)
.map_err(Into::into)
} else {
self._arroy_items_in_store(rtxn, self.arroy_angular_db(), store_id, with_items)
.map_err(Into::into)
}
} else if self.quantized {
self._items_in_store(rtxn, self.quantized_db(), store_id, with_items)
.map_err(Into::into)
} else {
self._items_in_store(rtxn, self.angular_db(), store_id, with_items)
self._items_in_store(rtxn, self.angular_db(), store_id, with_items).map_err(Into::into)
}
}
fn _items_in_store<D: arroy::Distance, F, O>(
fn _arroy_items_in_store<D: arroy::Distance, F, O>(
&self,
rtxn: &RoTxn,
db: arroy::Database<D>,
@@ -107,7 +147,7 @@ impl ArroyWrapper {
where
F: FnOnce(&RoaringBitmap) -> O,
{
let index = arroy_store_for_embedder(self.embedder_index, store_id);
let index = vector_store_for_embedder(self.embedder_index, store_id);
let reader = arroy::Reader::open(rtxn, index, db);
match reader {
Ok(reader) => Ok(with_items(reader.item_ids())),
@@ -116,8 +156,41 @@ impl ArroyWrapper {
}
}
pub fn dimensions(&self, rtxn: &RoTxn) -> Result<Option<usize>, arroy::Error> {
if self.quantized {
fn _items_in_store<D: hannoy::Distance, F, O>(
&self,
rtxn: &RoTxn,
db: hannoy::Database<D>,
store_id: u8,
with_items: F,
) -> Result<O, hannoy::Error>
where
F: FnOnce(&RoaringBitmap) -> O,
{
let index = vector_store_for_embedder(self.embedder_index, store_id);
let reader = hannoy::Reader::open(rtxn, index, db);
match reader {
Ok(reader) => Ok(with_items(reader.item_ids())),
Err(hannoy::Error::MissingMetadata(_)) => Ok(with_items(&RoaringBitmap::new())),
Err(err) => Err(err),
}
}
pub fn dimensions(&self, rtxn: &RoTxn) -> crate::Result<Option<usize>> {
if self.version_uses_arroy() {
if self.quantized {
Ok(self
.arroy_readers(rtxn, self.arroy_quantized_db())
.next()
.transpose()?
.map(|reader| reader.dimensions()))
} else {
Ok(self
.arroy_readers(rtxn, self.arroy_angular_db())
.next()
.transpose()?
.map(|reader| reader.dimensions()))
}
} else if self.quantized {
Ok(self
.readers(rtxn, self.quantized_db())
.next()
@@ -132,47 +205,92 @@ impl ArroyWrapper {
}
}
pub fn convert_from_arroy(&self, wtxn: &mut RwTxn, progress: Progress) -> crate::Result<()> {
if self.quantized {
let dimensions = self
.arroy_readers(wtxn, self.arroy_quantized_db())
.next()
.transpose()?
.map(|reader| reader.dimensions());
let Some(dimensions) = dimensions else { return Ok(()) };
for index in vector_store_range_for_embedder(self.embedder_index) {
let mut rng = rand::rngs::StdRng::from_entropy();
let writer = hannoy::Writer::new(self.quantized_db(), index, dimensions);
let mut builder = writer.builder(&mut rng).progress(progress.clone());
builder.prepare_arroy_conversion(wtxn)?;
builder.build::<HANNOY_M, HANNOY_M0>(wtxn)?;
}
Ok(())
} else {
let dimensions = self
.arroy_readers(wtxn, self.arroy_angular_db())
.next()
.transpose()?
.map(|reader| reader.dimensions());
let Some(dimensions) = dimensions else { return Ok(()) };
for index in vector_store_range_for_embedder(self.embedder_index) {
let mut rng = rand::rngs::StdRng::from_entropy();
let writer = hannoy::Writer::new(self.angular_db(), index, dimensions);
let mut builder = writer.builder(&mut rng).progress(progress.clone());
builder.prepare_arroy_conversion(wtxn)?;
builder.build::<HANNOY_M, HANNOY_M0>(wtxn)?;
}
Ok(())
}
}
#[allow(clippy::too_many_arguments)]
pub fn build_and_quantize<R: rand::Rng + rand::SeedableRng>(
&mut self,
wtxn: &mut RwTxn,
progress: &Progress,
progress: Progress,
rng: &mut R,
dimension: usize,
quantizing: bool,
arroy_memory: Option<usize>,
hannoy_memory: Option<usize>,
cancel: &(impl Fn() -> bool + Sync + Send),
) -> Result<(), arroy::Error> {
for index in arroy_store_range_for_embedder(self.embedder_index) {
) -> Result<(), hannoy::Error> {
for index in vector_store_range_for_embedder(self.embedder_index) {
if self.quantized {
let writer = arroy::Writer::new(self.quantized_db(), index, dimension);
let writer = hannoy::Writer::new(self.quantized_db(), index, dimension);
if writer.need_build(wtxn)? {
writer.builder(rng).build(wtxn)?
let mut builder = writer.builder(rng).progress(progress.clone());
builder
.available_memory(hannoy_memory.unwrap_or(usize::MAX))
.cancel(cancel)
.ef_construction(HANNOY_EF_CONSTRUCTION)
.build::<HANNOY_M, HANNOY_M0>(wtxn)?;
} else if writer.is_empty(wtxn)? {
continue;
}
} else {
let writer = arroy::Writer::new(self.angular_db(), index, dimension);
let writer = hannoy::Writer::new(self.angular_db(), index, dimension);
// If we are quantizing the databases, we can't know from meilisearch
// if the db was empty but still contained the wrong metadata, thus we need
// to quantize everything and can't stop early. Since this operation can
// only happen once in the life of an embedder, it's not very performance
// sensitive.
if quantizing && !self.quantized {
let writer = writer.prepare_changing_distance::<BinaryQuantizedCosine>(wtxn)?;
writer
.builder(rng)
.available_memory(arroy_memory.unwrap_or(usize::MAX))
.progress(|step| progress.update_progress_from_arroy(step))
let writer = writer.prepare_changing_distance::<Hamming>(wtxn)?;
let mut builder = writer.builder(rng).progress(progress.clone());
builder
.available_memory(hannoy_memory.unwrap_or(usize::MAX))
.cancel(cancel)
.build(wtxn)?;
.ef_construction(HANNOY_EF_CONSTRUCTION)
.build::<HANNOY_M, HANNOY_M0>(wtxn)?;
} else if writer.need_build(wtxn)? {
writer
.builder(rng)
.available_memory(arroy_memory.unwrap_or(usize::MAX))
.progress(|step| progress.update_progress_from_arroy(step))
let mut builder = writer.builder(rng).progress(progress.clone());
builder
.available_memory(hannoy_memory.unwrap_or(usize::MAX))
.cancel(cancel)
.build(wtxn)?;
.ef_construction(HANNOY_EF_CONSTRUCTION)
.build::<HANNOY_M, HANNOY_M0>(wtxn)?;
} else if writer.is_empty(wtxn)? {
continue;
}
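For reference, the builder chain introduced here wires the three constants from the top of the file into every store. Under the standard HNSW reading (an assumption, hannoy's documentation is not quoted in this diff), M/M0 bound the number of links per node and efConstruction the candidate list explored per insertion; a condensed, annotated shape of the per-store call, with identifiers as in the diff:

let writer = hannoy::Writer::new(self.angular_db(), index, dimension);
let mut builder = writer.builder(rng).progress(progress.clone());
builder
    .available_memory(hannoy_memory.unwrap_or(usize::MAX)) // no cap when no memory budget was passed
    .cancel(cancel)                                         // cooperative cancellation hook
    .ef_construction(HANNOY_EF_CONSTRUCTION)                // 125 candidates tracked per insertion
    .build::<HANNOY_M, HANNOY_M0>(wtxn)?;                   // 16 links per node, 32 on layer 0 (assumed meaning)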
@@ -188,18 +306,18 @@ impl ArroyWrapper {
pub fn add_items(
&self,
wtxn: &mut RwTxn,
item_id: arroy::ItemId,
item_id: hannoy::ItemId,
embeddings: &Embeddings<f32>,
) -> Result<(), arroy::Error> {
) -> Result<(), hannoy::Error> {
let dimension = embeddings.dimension();
for (index, vector) in
arroy_store_range_for_embedder(self.embedder_index).zip(embeddings.iter())
vector_store_range_for_embedder(self.embedder_index).zip(embeddings.iter())
{
if self.quantized {
arroy::Writer::new(self.quantized_db(), index, dimension)
hannoy::Writer::new(self.quantized_db(), index, dimension)
.add_item(wtxn, item_id, vector)?
} else {
arroy::Writer::new(self.angular_db(), index, dimension)
hannoy::Writer::new(self.angular_db(), index, dimension)
.add_item(wtxn, item_id, vector)?
}
}
@@ -210,9 +328,9 @@ impl ArroyWrapper {
pub fn add_item(
&self,
wtxn: &mut RwTxn,
item_id: arroy::ItemId,
item_id: hannoy::ItemId,
vector: &[f32],
) -> Result<(), arroy::Error> {
) -> Result<(), hannoy::Error> {
if self.quantized {
self._add_item(wtxn, self.quantized_db(), item_id, vector)
} else {
@@ -220,17 +338,17 @@ impl ArroyWrapper {
}
}
fn _add_item<D: arroy::Distance>(
fn _add_item<D: hannoy::Distance>(
&self,
wtxn: &mut RwTxn,
db: arroy::Database<D>,
item_id: arroy::ItemId,
db: hannoy::Database<D>,
item_id: hannoy::ItemId,
vector: &[f32],
) -> Result<(), arroy::Error> {
) -> Result<(), hannoy::Error> {
let dimension = vector.len();
for index in arroy_store_range_for_embedder(self.embedder_index) {
let writer = arroy::Writer::new(db, index, dimension);
for index in vector_store_range_for_embedder(self.embedder_index) {
let writer = hannoy::Writer::new(db, index, dimension);
if !writer.contains_item(wtxn, item_id)? {
writer.add_item(wtxn, item_id, vector)?;
break;
@@ -245,10 +363,10 @@ impl ArroyWrapper {
pub fn add_item_in_store(
&self,
wtxn: &mut RwTxn,
item_id: arroy::ItemId,
item_id: hannoy::ItemId,
store_id: u8,
vector: &[f32],
) -> Result<(), arroy::Error> {
) -> Result<(), hannoy::Error> {
if self.quantized {
self._add_item_in_store(wtxn, self.quantized_db(), item_id, store_id, vector)
} else {
@@ -256,18 +374,18 @@ impl ArroyWrapper {
}
}
fn _add_item_in_store<D: arroy::Distance>(
fn _add_item_in_store<D: hannoy::Distance>(
&self,
wtxn: &mut RwTxn,
db: arroy::Database<D>,
item_id: arroy::ItemId,
db: hannoy::Database<D>,
item_id: hannoy::ItemId,
store_id: u8,
vector: &[f32],
) -> Result<(), arroy::Error> {
) -> Result<(), hannoy::Error> {
let dimension = vector.len();
let index = arroy_store_for_embedder(self.embedder_index, store_id);
let writer = arroy::Writer::new(db, index, dimension);
let index = vector_store_for_embedder(self.embedder_index, store_id);
let writer = hannoy::Writer::new(db, index, dimension);
writer.add_item(wtxn, item_id, vector)
}
@@ -276,14 +394,14 @@ impl ArroyWrapper {
&self,
wtxn: &mut RwTxn,
dimension: usize,
item_id: arroy::ItemId,
) -> Result<(), arroy::Error> {
for index in arroy_store_range_for_embedder(self.embedder_index) {
item_id: hannoy::ItemId,
) -> Result<(), hannoy::Error> {
for index in vector_store_range_for_embedder(self.embedder_index) {
if self.quantized {
let writer = arroy::Writer::new(self.quantized_db(), index, dimension);
let writer = hannoy::Writer::new(self.quantized_db(), index, dimension);
writer.del_item(wtxn, item_id)?;
} else {
let writer = arroy::Writer::new(self.angular_db(), index, dimension);
let writer = hannoy::Writer::new(self.angular_db(), index, dimension);
writer.del_item(wtxn, item_id)?;
}
}
@@ -301,10 +419,10 @@ impl ArroyWrapper {
pub fn del_item_in_store(
&self,
wtxn: &mut RwTxn,
item_id: arroy::ItemId,
item_id: hannoy::ItemId,
store_id: u8,
dimensions: usize,
) -> Result<bool, arroy::Error> {
) -> Result<bool, hannoy::Error> {
if self.quantized {
self._del_item_in_store(wtxn, self.quantized_db(), item_id, store_id, dimensions)
} else {
@@ -312,16 +430,16 @@ impl ArroyWrapper {
}
}
fn _del_item_in_store<D: arroy::Distance>(
fn _del_item_in_store<D: hannoy::Distance>(
&self,
wtxn: &mut RwTxn,
db: arroy::Database<D>,
item_id: arroy::ItemId,
db: hannoy::Database<D>,
item_id: hannoy::ItemId,
store_id: u8,
dimensions: usize,
) -> Result<bool, arroy::Error> {
let index = arroy_store_for_embedder(self.embedder_index, store_id);
let writer = arroy::Writer::new(db, index, dimensions);
) -> Result<bool, hannoy::Error> {
let index = vector_store_for_embedder(self.embedder_index, store_id);
let writer = hannoy::Writer::new(db, index, dimensions);
writer.del_item(wtxn, item_id)
}
@@ -335,7 +453,7 @@ impl ArroyWrapper {
wtxn: &mut RwTxn,
store_id: u8,
dimensions: usize,
) -> Result<(), arroy::Error> {
) -> Result<(), hannoy::Error> {
if self.quantized {
self._clear_store(wtxn, self.quantized_db(), store_id, dimensions)
} else {
@@ -343,15 +461,15 @@ impl ArroyWrapper {
}
}
fn _clear_store<D: arroy::Distance>(
fn _clear_store<D: hannoy::Distance>(
&self,
wtxn: &mut RwTxn,
db: arroy::Database<D>,
db: hannoy::Database<D>,
store_id: u8,
dimensions: usize,
) -> Result<(), arroy::Error> {
let index = arroy_store_for_embedder(self.embedder_index, store_id);
let writer = arroy::Writer::new(db, index, dimensions);
) -> Result<(), hannoy::Error> {
let index = vector_store_for_embedder(self.embedder_index, store_id);
let writer = hannoy::Writer::new(db, index, dimensions);
writer.clear(wtxn)
}
@@ -359,9 +477,9 @@ impl ArroyWrapper {
pub fn del_item(
&self,
wtxn: &mut RwTxn,
item_id: arroy::ItemId,
item_id: hannoy::ItemId,
vector: &[f32],
) -> Result<bool, arroy::Error> {
) -> Result<bool, hannoy::Error> {
if self.quantized {
self._del_item(wtxn, self.quantized_db(), item_id, vector)
} else {
@@ -369,37 +487,34 @@ impl ArroyWrapper {
}
}
fn _del_item<D: arroy::Distance>(
fn _del_item<D: hannoy::Distance>(
&self,
wtxn: &mut RwTxn,
db: arroy::Database<D>,
item_id: arroy::ItemId,
db: hannoy::Database<D>,
item_id: hannoy::ItemId,
vector: &[f32],
) -> Result<bool, arroy::Error> {
) -> Result<bool, hannoy::Error> {
let dimension = vector.len();
for index in arroy_store_range_for_embedder(self.embedder_index) {
let writer = arroy::Writer::new(db, index, dimension);
let Some(candidate) = writer.item_vector(wtxn, item_id)? else {
continue;
};
if candidate == vector {
for index in vector_store_range_for_embedder(self.embedder_index) {
let writer = hannoy::Writer::new(db, index, dimension);
if writer.contains_item(wtxn, item_id)? {
return writer.del_item(wtxn, item_id);
}
}
Ok(false)
}
pub fn clear(&self, wtxn: &mut RwTxn, dimension: usize) -> Result<(), arroy::Error> {
for index in arroy_store_range_for_embedder(self.embedder_index) {
pub fn clear(&self, wtxn: &mut RwTxn, dimension: usize) -> Result<(), hannoy::Error> {
for index in vector_store_range_for_embedder(self.embedder_index) {
if self.quantized {
let writer = arroy::Writer::new(self.quantized_db(), index, dimension);
let writer = hannoy::Writer::new(self.quantized_db(), index, dimension);
if writer.is_empty(wtxn)? {
continue;
}
writer.clear(wtxn)?;
} else {
let writer = arroy::Writer::new(self.angular_db(), index, dimension);
let writer = hannoy::Writer::new(self.angular_db(), index, dimension);
if writer.is_empty(wtxn)? {
continue;
}
@@ -413,17 +528,31 @@ impl ArroyWrapper {
&self,
rtxn: &RoTxn,
dimension: usize,
item: arroy::ItemId,
) -> Result<bool, arroy::Error> {
for index in arroy_store_range_for_embedder(self.embedder_index) {
let contains = if self.quantized {
let writer = arroy::Writer::new(self.quantized_db(), index, dimension);
item: hannoy::ItemId,
) -> crate::Result<bool> {
for index in vector_store_range_for_embedder(self.embedder_index) {
let contains = if self.version_uses_arroy() {
if self.quantized {
let writer = arroy::Writer::new(self.arroy_quantized_db(), index, dimension);
if writer.is_empty(rtxn)? {
continue;
}
writer.contains_item(rtxn, item)?
} else {
let writer = arroy::Writer::new(self.arroy_angular_db(), index, dimension);
if writer.is_empty(rtxn)? {
continue;
}
writer.contains_item(rtxn, item)?
}
} else if self.quantized {
let writer = hannoy::Writer::new(self.quantized_db(), index, dimension);
if writer.is_empty(rtxn)? {
continue;
}
writer.contains_item(rtxn, item)?
} else {
let writer = arroy::Writer::new(self.angular_db(), index, dimension);
let writer = hannoy::Writer::new(self.angular_db(), index, dimension);
if writer.is_empty(rtxn)? {
continue;
}
@@ -442,15 +571,23 @@ impl ArroyWrapper {
item: ItemId,
limit: usize,
filter: Option<&RoaringBitmap>,
) -> Result<Vec<(ItemId, f32)>, arroy::Error> {
if self.quantized {
self._nns_by_item(rtxn, self.quantized_db(), item, limit, filter)
) -> crate::Result<Vec<(ItemId, f32)>> {
if self.version_uses_arroy() {
if self.quantized {
self._arroy_nns_by_item(rtxn, self.arroy_quantized_db(), item, limit, filter)
.map_err(Into::into)
} else {
self._arroy_nns_by_item(rtxn, self.arroy_angular_db(), item, limit, filter)
.map_err(Into::into)
}
} else if self.quantized {
self._nns_by_item(rtxn, self.quantized_db(), item, limit, filter).map_err(Into::into)
} else {
self._nns_by_item(rtxn, self.angular_db(), item, limit, filter)
self._nns_by_item(rtxn, self.angular_db(), item, limit, filter).map_err(Into::into)
}
}
fn _nns_by_item<D: arroy::Distance>(
fn _arroy_nns_by_item<D: arroy::Distance>(
&self,
rtxn: &RoTxn,
db: arroy::Database<D>,
@@ -460,7 +597,7 @@ impl ArroyWrapper {
) -> Result<Vec<(ItemId, f32)>, arroy::Error> {
let mut results = Vec::new();
for reader in self.readers(rtxn, db) {
for reader in self.arroy_readers(rtxn, db) {
let reader = reader?;
let mut searcher = reader.nns(limit);
if let Some(filter) = filter {
@@ -478,21 +615,56 @@ impl ArroyWrapper {
         Ok(results)
     }

+    fn _nns_by_item<D: hannoy::Distance>(
+        &self,
+        rtxn: &RoTxn,
+        db: hannoy::Database<D>,
+        item: ItemId,
+        limit: usize,
+        filter: Option<&RoaringBitmap>,
+    ) -> Result<Vec<(ItemId, f32)>, hannoy::Error> {
+        let mut results = Vec::new();
+
+        for reader in self.readers(rtxn, db) {
+            let reader = reader?;
+            let mut searcher = reader.nns(limit);
+            searcher.ef_search((limit * 10).max(100)); // TODO find better ef
+            if let Some(filter) = filter {
+                searcher.candidates(filter);
+            }
+
+            if let Some(mut ret) = searcher.by_item(rtxn, item)? {
+                results.append(&mut ret);
+            }
+        }
+
+        results.sort_unstable_by_key(|(_, distance)| OrderedFloat(*distance));
+        Ok(results)
+    }
+
     pub fn nns_by_vector(
         &self,
         rtxn: &RoTxn,
         vector: &[f32],
         limit: usize,
         filter: Option<&RoaringBitmap>,
-    ) -> Result<Vec<(ItemId, f32)>, arroy::Error> {
-        if self.quantized {
+    ) -> crate::Result<Vec<(ItemId, f32)>> {
+        if self.version_uses_arroy() {
+            if self.quantized {
+                self._arroy_nns_by_vector(rtxn, self.arroy_quantized_db(), vector, limit, filter)
+                    .map_err(Into::into)
+            } else {
+                self._arroy_nns_by_vector(rtxn, self.arroy_angular_db(), vector, limit, filter)
+                    .map_err(Into::into)
+            }
+        } else if self.quantized {
+            self._nns_by_vector(rtxn, self.quantized_db(), vector, limit, filter)
+                .map_err(Into::into)
         } else {
-            self._nns_by_vector(rtxn, self.angular_db(), vector, limit, filter)
+            self._nns_by_vector(rtxn, self.angular_db(), vector, limit, filter).map_err(Into::into)
         }
     }

-    fn _nns_by_vector<D: arroy::Distance>(
+    fn _arroy_nns_by_vector<D: arroy::Distance>(
         &self,
         rtxn: &RoTxn,
         db: arroy::Database<D>,
@@ -502,7 +674,7 @@ impl ArroyWrapper {
     ) -> Result<Vec<(ItemId, f32)>, arroy::Error> {
         let mut results = Vec::new();

-        for reader in self.readers(rtxn, db) {
+        for reader in self.arroy_readers(rtxn, db) {
             let reader = reader?;
             let mut searcher = reader.nns(limit);
             if let Some(filter) = filter {
@@ -520,10 +692,50 @@ impl ArroyWrapper {
         Ok(results)
     }

-    pub fn item_vectors(&self, rtxn: &RoTxn, item_id: u32) -> Result<Vec<Vec<f32>>, arroy::Error> {
+    fn _nns_by_vector<D: hannoy::Distance>(
+        &self,
+        rtxn: &RoTxn,
+        db: hannoy::Database<D>,
+        vector: &[f32],
+        limit: usize,
+        filter: Option<&RoaringBitmap>,
+    ) -> Result<Vec<(ItemId, f32)>, hannoy::Error> {
+        let mut results = Vec::new();
+
+        for reader in self.readers(rtxn, db) {
+            let reader = reader?;
+            let mut searcher = reader.nns(limit);
+            searcher.ef_search((limit * 10).max(100)); // TODO find better ef
+            if let Some(filter) = filter {
+                searcher.candidates(filter);
+            }
+
+            results.append(&mut searcher.by_vector(rtxn, vector)?);
+        }
+
+        results.sort_unstable_by_key(|(_, distance)| OrderedFloat(*distance));
+        Ok(results)
+    }
+
+    pub fn item_vectors(&self, rtxn: &RoTxn, item_id: u32) -> crate::Result<Vec<Vec<f32>>> {
         let mut vectors = Vec::new();

-        if self.quantized {
+        if self.version_uses_arroy() {
+            if self.quantized {
+                for reader in self.arroy_readers(rtxn, self.arroy_quantized_db()) {
+                    if let Some(vec) = reader?.item_vector(rtxn, item_id)? {
+                        vectors.push(vec);
+                    }
+                }
+            } else {
+                for reader in self.arroy_readers(rtxn, self.arroy_angular_db()) {
+                    if let Some(vec) = reader?.item_vector(rtxn, item_id)? {
+                        vectors.push(vec);
+                    }
+                }
+            }
+        } else if self.quantized {
             for reader in self.readers(rtxn, self.quantized_db()) {
                 if let Some(vec) = reader?.item_vector(rtxn, item_id)? {
                     vectors.push(vec);
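
Both hannoy helpers above size the search beam identically: `ef_search((limit * 10).max(100))` scales the candidate beam ten-fold with the requested `limit` while never letting it drop below 100. A toy check of that arithmetic:

    // The ef heuristic used by _nns_by_item and _nns_by_vector above.
    fn ef_for(limit: usize) -> usize {
        (limit * 10).max(100)
    }

    fn main() {
        assert_eq!(ef_for(3), 100); // small limits are floored at ef = 100
        assert_eq!(ef_for(20), 200); // larger limits keep the 10x multiplier
    }
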
@@ -536,22 +748,31 @@ impl ArroyWrapper {
                 }
             }
         }

         Ok(vectors)
     }

-    fn angular_db(&self) -> arroy::Database<Cosine> {
+    fn arroy_angular_db(&self) -> arroy::Database<arroy::distances::Cosine> {
         self.database.remap_types()
     }

+    fn arroy_quantized_db(&self) -> arroy::Database<arroy::distances::BinaryQuantizedCosine> {
+        self.database.remap_types()
+    }
+
+    fn angular_db(&self) -> hannoy::Database<Cosine> {
+        self.database.remap_data_type()
+    }
+
-    fn quantized_db(&self) -> arroy::Database<BinaryQuantizedCosine> {
+    fn quantized_db(&self) -> hannoy::Database<Hamming> {
         self.database.remap_data_type()
     }

     pub fn aggregate_stats(
         &self,
         rtxn: &RoTxn,
-        stats: &mut ArroyStats,
-    ) -> Result<(), arroy::Error> {
+        stats: &mut HannoyStats,
+    ) -> Result<(), hannoy::Error> {
         if self.quantized {
             for reader in self.readers(rtxn, self.quantized_db()) {
                 let reader = reader?;
@@ -573,10 +794,11 @@ impl ArroyWrapper {
 }

 #[derive(Debug, Default, Clone)]
-pub struct ArroyStats {
+pub struct HannoyStats {
     pub number_of_embeddings: u64,
     pub documents: RoaringBitmap,
 }

 /// One or multiple embeddings stored consecutively in a flat vector.
 #[derive(Debug, PartialEq)]
 pub struct Embeddings<F> {
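
As a usage sketch for the renamed stats type, assuming the wrapper is the `VectorStore` from this file, an open read transaction `rtxn`, and error handling left to the caller:

    // Hypothetical call site; `VectorStore`, `rtxn`, and the surrounding
    // error plumbing are assumed from the context above.
    fn print_stats(store: &VectorStore, rtxn: &RoTxn) -> Result<(), hannoy::Error> {
        let mut stats = HannoyStats::default();
        store.aggregate_stats(rtxn, &mut stats)?;
        println!(
            "{} embeddings spread over {} documents",
            stats.number_of_embeddings,
            stats.documents.len() // `documents` is a RoaringBitmap of document ids
        );
        Ok(())
    }
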
@@ -1221,11 +1443,11 @@ pub const fn is_cuda_enabled() -> bool {
     cfg!(feature = "cuda")
 }

-fn arroy_store_range_for_embedder(embedder_id: u8) -> impl Iterator<Item = u16> {
-    (0..=u8::MAX).map(move |store_id| arroy_store_for_embedder(embedder_id, store_id))
+fn vector_store_range_for_embedder(embedder_id: u8) -> impl Iterator<Item = u16> {
+    (0..=u8::MAX).map(move |store_id| vector_store_for_embedder(embedder_id, store_id))
 }

-fn arroy_store_for_embedder(embedder_id: u8, store_id: u8) -> u16 {
+fn vector_store_for_embedder(embedder_id: u8, store_id: u8) -> u16 {
     let embedder_id = (embedder_id as u16) << 8;
     embedder_id | (store_id as u16)
 }
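
The two helpers above pack the embedder id into the high byte of the u16 store id and the per-embedder store id into the low byte, so each embedder owns a contiguous block of 256 stores. A quick sanity check of that layout:

    fn main() {
        // embedder 3, store 7 -> (3 << 8) | 7 = 775
        assert_eq!(vector_store_for_embedder(3, 7), 775);
        // embedder 3 owns the contiguous range 768..=1023
        let range: Vec<u16> = vector_store_range_for_embedder(3).collect();
        assert_eq!(range.first(), Some(&768));
        assert_eq!(range.last(), Some(&1023));
        assert_eq!(range.len(), 256);
    }
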

View File

@@ -321,7 +321,14 @@ impl Embedder {
     pub fn prompt_count_in_chunk_hint(&self) -> usize {
         match self.data.request.input_type() {
             InputType::Text => 1,
-            InputType::TextArray => 10,
+            InputType::TextArray => {
+                let chunk_size = std::env::var("MEILI_EMBEDDINGS_CHUNK_SIZE")
+                    .ok()
+                    .and_then(|chunk_size| chunk_size.parse().ok())
+                    .unwrap_or(10);
+                assert!(chunk_size <= 100, "Embedding chunk size cannot exceed 100");
+                chunk_size
+            }
         }
     }
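
The override above is a parse-with-fallback chain: an unset or unparsable `MEILI_EMBEDDINGS_CHUNK_SIZE` falls back to the previous hard-coded hint of 10, and any value above 100 panics on the `assert!`. A standalone sketch of the same chain (the helper name is made up for illustration):

    fn chunk_size_from(raw: Option<&str>) -> usize {
        raw.and_then(|value| value.parse().ok()).unwrap_or(10)
    }

    fn main() {
        assert_eq!(chunk_size_from(Some("50")), 50); // valid override
        assert_eq!(chunk_size_from(Some("lots")), 10); // parse failure -> default 10
        assert_eq!(chunk_size_from(None), 10); // unset -> default 10
    }
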

View File

@@ -59,6 +59,7 @@ fn test_facet_distribution_with_no_facet_values() {
             &mut new_fields_ids_map,
             &|| false,
             Progress::default(),
+            None,
         )
         .unwrap();

View File

@@ -97,6 +97,7 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index {
             &mut new_fields_ids_map,
             &|| false,
             Progress::default(),
+            None,
         )
         .unwrap();

View File

@@ -329,6 +329,7 @@ fn criteria_ascdesc() {
             &mut new_fields_ids_map,
             &|| false,
             Progress::default(),
+            None,
         )
         .unwrap();

View File

@@ -138,6 +138,7 @@ fn test_typo_disabled_on_word() {
             &mut new_fields_ids_map,
             &|| false,
             Progress::default(),
+            None,
         )
         .unwrap();

View File

@@ -59,7 +59,7 @@ fn fibo_recursive(n: u32) -> u32 {
     if n == 1 {
         return 2;
     }
-    return fibo_recursive(n - 1) - fibo_recursive(n - 2);
+    fibo_recursive(n - 1) - fibo_recursive(n - 2)
 }

 use tracing_error::ExtractSpanTrace as _;