mirror of
https://github.com/meilisearch/meilisearch.git
synced 2025-11-26 07:40:31 +00:00
Compare commits
65 Commits
adapt-pref
...
prototype-
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
60bfd3aef1 | ||
|
|
5027eea1a8 | ||
|
|
5079fb4b14 | ||
|
|
8e016fbfeb | ||
|
|
1ccde9bf0b | ||
|
|
34e814f400 | ||
|
|
a6fa0b97ec | ||
|
|
552127021f | ||
|
|
38abfec611 | ||
|
|
84a5c304fc | ||
|
|
e93d36d5b9 | ||
|
|
95f8e21533 | ||
|
|
b4d7d80ad9 | ||
|
|
68f197624e | ||
|
|
b79b03d4e2 | ||
|
|
86270e6878 | ||
|
|
81b6128b29 | ||
|
|
5f5a486895 | ||
|
|
5f4fc6c955 | ||
|
|
1f5e8fc072 | ||
|
|
3f3462ab62 | ||
|
|
93363b0201 | ||
|
|
97bb1ff9e2 | ||
|
|
5ee1378856 | ||
|
|
e27b850b09 | ||
|
|
f75f22e026 | ||
|
|
6203f4acef | ||
|
|
12edc2c20a | ||
|
|
94b9f3b310 | ||
|
|
5204c0b60b | ||
|
|
e73cd692db | ||
|
|
29b453346b | ||
|
|
c4bb435374 | ||
|
|
da99a04eb3 | ||
|
|
54ae6951eb | ||
|
|
2bcff2ea46 | ||
|
|
1275e72e0b | ||
|
|
658ec6e0a4 | ||
|
|
43e822e802 | ||
|
|
ee54d3171e | ||
|
|
a0e713c4e7 | ||
|
|
d4cb0a885b | ||
|
|
f52dee2b3b | ||
|
|
0bf879fb88 | ||
|
|
6ff81de401 | ||
|
|
2e4c9651df | ||
|
|
ec9649c922 | ||
|
|
9123370e90 | ||
|
|
14b396d302 | ||
|
|
393216bf30 | ||
|
|
e249e4db7b | ||
|
|
de2ca7006e | ||
|
|
333ce12eb2 | ||
|
|
fa2b96b9a5 | ||
|
|
19736cefe8 | ||
|
|
4fb25b8782 | ||
|
|
c83a33017e | ||
|
|
be72326c0a | ||
|
|
547379abb0 | ||
|
|
0b2fff27f2 | ||
|
|
3adbc2b942 | ||
|
|
fbea721378 | ||
|
|
391eb72137 | ||
|
|
d78ad51082 | ||
|
|
1956045a06 |
38
.github/workflows/sdks-tests.yml
vendored
38
.github/workflows/sdks-tests.yml
vendored
@@ -22,7 +22,7 @@ jobs:
|
||||
outputs:
|
||||
docker-image: ${{ steps.define-image.outputs.docker-image }}
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
- uses: actions/checkout@v4
|
||||
- name: Define the Docker image we need to use
|
||||
id: define-image
|
||||
run: |
|
||||
@@ -46,11 +46,11 @@ jobs:
|
||||
MEILISEARCH_VERSION: ${{ needs.define-docker-image.outputs.docker-image }}
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
- uses: actions/checkout@v4
|
||||
with:
|
||||
repository: meilisearch/meilisearch-dotnet
|
||||
- name: Setup .NET Core
|
||||
uses: actions/setup-dotnet@v3
|
||||
uses: actions/setup-dotnet@v4
|
||||
with:
|
||||
dotnet-version: "6.0.x"
|
||||
- name: Install dependencies
|
||||
@@ -75,12 +75,12 @@ jobs:
|
||||
ports:
|
||||
- '7700:7700'
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
- uses: actions/checkout@v4
|
||||
with:
|
||||
repository: meilisearch/meilisearch-dart
|
||||
- uses: dart-lang/setup-dart@v1
|
||||
with:
|
||||
sdk: 3.1.1
|
||||
sdk: 'latest'
|
||||
- name: Install dependencies
|
||||
run: dart pub get
|
||||
- name: Run integration tests
|
||||
@@ -100,10 +100,10 @@ jobs:
|
||||
- '7700:7700'
|
||||
steps:
|
||||
- name: Set up Go
|
||||
uses: actions/setup-go@v4
|
||||
uses: actions/setup-go@v5
|
||||
with:
|
||||
go-version: stable
|
||||
- uses: actions/checkout@v3
|
||||
- uses: actions/checkout@v4
|
||||
with:
|
||||
repository: meilisearch/meilisearch-go
|
||||
- name: Get dependencies
|
||||
@@ -129,11 +129,11 @@ jobs:
|
||||
ports:
|
||||
- '7700:7700'
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
- uses: actions/checkout@v4
|
||||
with:
|
||||
repository: meilisearch/meilisearch-java
|
||||
- name: Set up Java
|
||||
uses: actions/setup-java@v3
|
||||
uses: actions/setup-java@v4
|
||||
with:
|
||||
java-version: 8
|
||||
distribution: 'zulu'
|
||||
@@ -156,7 +156,7 @@ jobs:
|
||||
ports:
|
||||
- '7700:7700'
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
- uses: actions/checkout@v4
|
||||
with:
|
||||
repository: meilisearch/meilisearch-js
|
||||
- name: Setup node
|
||||
@@ -191,7 +191,7 @@ jobs:
|
||||
ports:
|
||||
- '7700:7700'
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
- uses: actions/checkout@v4
|
||||
with:
|
||||
repository: meilisearch/meilisearch-php
|
||||
- name: Install PHP
|
||||
@@ -220,11 +220,11 @@ jobs:
|
||||
ports:
|
||||
- '7700:7700'
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
- uses: actions/checkout@v4
|
||||
with:
|
||||
repository: meilisearch/meilisearch-python
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v4
|
||||
uses: actions/setup-python@v5
|
||||
- name: Install pipenv
|
||||
uses: dschep/install-pipenv-action@v1
|
||||
- name: Install dependencies
|
||||
@@ -245,7 +245,7 @@ jobs:
|
||||
ports:
|
||||
- '7700:7700'
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
- uses: actions/checkout@v4
|
||||
with:
|
||||
repository: meilisearch/meilisearch-ruby
|
||||
- name: Set up Ruby 3
|
||||
@@ -270,7 +270,7 @@ jobs:
|
||||
ports:
|
||||
- '7700:7700'
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
- uses: actions/checkout@v4
|
||||
with:
|
||||
repository: meilisearch/meilisearch-rust
|
||||
- name: Build
|
||||
@@ -291,7 +291,7 @@ jobs:
|
||||
ports:
|
||||
- '7700:7700'
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
- uses: actions/checkout@v4
|
||||
with:
|
||||
repository: meilisearch/meilisearch-swift
|
||||
- name: Run tests
|
||||
@@ -314,7 +314,7 @@ jobs:
|
||||
ports:
|
||||
- '7700:7700'
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
- uses: actions/checkout@v4
|
||||
with:
|
||||
repository: meilisearch/meilisearch-js-plugins
|
||||
- name: Setup node
|
||||
@@ -345,7 +345,7 @@ jobs:
|
||||
ports:
|
||||
- '7700:7700'
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
- uses: actions/checkout@v4
|
||||
with:
|
||||
repository: meilisearch/meilisearch-rails
|
||||
- name: Set up Ruby 3
|
||||
@@ -369,7 +369,7 @@ jobs:
|
||||
ports:
|
||||
- '7700:7700'
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
- uses: actions/checkout@v4
|
||||
with:
|
||||
repository: meilisearch/meilisearch-symfony
|
||||
- name: Install PHP
|
||||
|
||||
26
Cargo.lock
generated
26
Cargo.lock
generated
@@ -383,7 +383,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "arroy"
|
||||
version = "0.1.0"
|
||||
source = "git+https://github.com/meilisearch/arroy.git#4f193fd534acd357b65bfe9eec4b3fed8ece2007"
|
||||
source = "git+https://github.com/meilisearch/arroy.git#d372648212e561a4845077cdb9239423d78655a2"
|
||||
dependencies = [
|
||||
"bytemuck",
|
||||
"byteorder",
|
||||
@@ -1677,9 +1677,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "flate2"
|
||||
version = "1.0.26"
|
||||
version = "1.0.28"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3b9429470923de8e8cbd4d2dc513535400b4b3fef0319fb5c4e1f520a7bef743"
|
||||
checksum = "46303f565772937ffe1d394a4fac6f411c6013172fadde9dcdb1e147a086940e"
|
||||
dependencies = [
|
||||
"crc32fast",
|
||||
"miniz_oxide",
|
||||
@@ -1701,9 +1701,9 @@ checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1"
|
||||
|
||||
[[package]]
|
||||
name = "form_urlencoded"
|
||||
version = "1.2.0"
|
||||
version = "1.2.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a62bc1cf6f830c2ec14a513a9fb124d0a213a629668a4186f329db21fe045652"
|
||||
checksum = "e13624c2627564efccf4934284bdd98cbaa14e79b0b5a141218e507b3a823456"
|
||||
dependencies = [
|
||||
"percent-encoding",
|
||||
]
|
||||
@@ -2753,9 +2753,9 @@ checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39"
|
||||
|
||||
[[package]]
|
||||
name = "idna"
|
||||
version = "0.4.0"
|
||||
version = "0.5.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7d20d6b07bfbc108882d88ed8e37d39636dcc260e15e30c45e6ba089610b917c"
|
||||
checksum = "634d9b1461af396cad843f47fdba5597a4f9e6ddd4bfb6ff5d85028c25cb12f6"
|
||||
dependencies = [
|
||||
"unicode-bidi",
|
||||
"unicode-normalization",
|
||||
@@ -2774,6 +2774,7 @@ dependencies = [
|
||||
"dump",
|
||||
"enum-iterator",
|
||||
"file-store",
|
||||
"flate2",
|
||||
"insta",
|
||||
"log",
|
||||
"meili-snap",
|
||||
@@ -2789,6 +2790,7 @@ dependencies = [
|
||||
"tempfile",
|
||||
"thiserror",
|
||||
"time",
|
||||
"ureq",
|
||||
"uuid 1.5.0",
|
||||
]
|
||||
|
||||
@@ -3559,6 +3561,7 @@ dependencies = [
|
||||
"tokio",
|
||||
"tokio-stream",
|
||||
"toml",
|
||||
"url",
|
||||
"urlencoding",
|
||||
"uuid 1.5.0",
|
||||
"vergen",
|
||||
@@ -4067,9 +4070,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "percent-encoding"
|
||||
version = "2.3.0"
|
||||
version = "2.3.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9b2a4787296e9989611394c33f193f676704af1686e70b8f8033ab5ba9a35a94"
|
||||
checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e"
|
||||
|
||||
[[package]]
|
||||
name = "permissive-json-pointer"
|
||||
@@ -5597,13 +5600,14 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "url"
|
||||
version = "2.4.0"
|
||||
version = "2.5.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "50bff7831e19200a85b17131d085c25d7811bc4e186efdaf54bbd132994a88cb"
|
||||
checksum = "31e6302e3bb753d46e83516cae55ae196fc0c309407cf11ab35cc51a4c2a4633"
|
||||
dependencies = [
|
||||
"form_urlencoded",
|
||||
"idna",
|
||||
"percent-encoding",
|
||||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
||||
2
LICENSE
2
LICENSE
@@ -1,6 +1,6 @@
|
||||
MIT License
|
||||
|
||||
Copyright (c) 2019-2022 Meili SAS
|
||||
Copyright (c) 2019-2024 Meili SAS
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
|
||||
@@ -42,7 +42,7 @@ Meilisearch helps you shape a delightful search experience in a snap, offering f
|
||||
|
||||
- **Search-as-you-type:** find search results in less than 50 milliseconds
|
||||
- **[Typo tolerance](https://www.meilisearch.com/docs/learn/getting_started/customizing_relevancy?utm_campaign=oss&utm_source=github&utm_medium=meilisearch&utm_content=features#typo-tolerance):** get relevant matches even when queries contain typos and misspellings
|
||||
- **[Filtering](https://www.meilisearch.com/docs/learn/fine_tuning_results/filtering?utm_campaign=oss&utm_source=github&utm_medium=meilisearch&utm_content=features) and [faceted search](https://www.meilisearch.com/docs/learn/fine_tuning_results/faceted_search?utm_campaign=oss&utm_source=github&utm_medium=meilisearch&utm_content=features):** enhance your user's search experience with custom filters and build a faceted search interface in a few lines of code
|
||||
- **[Filtering](https://www.meilisearch.com/docs/learn/fine_tuning_results/filtering?utm_campaign=oss&utm_source=github&utm_medium=meilisearch&utm_content=features) and [faceted search](https://www.meilisearch.com/docs/learn/fine_tuning_results/faceted_search?utm_campaign=oss&utm_source=github&utm_medium=meilisearch&utm_content=features):** enhance your users' search experience with custom filters and build a faceted search interface in a few lines of code
|
||||
- **[Sorting](https://www.meilisearch.com/docs/learn/fine_tuning_results/sorting?utm_campaign=oss&utm_source=github&utm_medium=meilisearch&utm_content=features):** sort results based on price, date, or pretty much anything else your users need
|
||||
- **[Synonym support](https://www.meilisearch.com/docs/learn/getting_started/customizing_relevancy?utm_campaign=oss&utm_source=github&utm_medium=meilisearch&utm_content=features#synonyms):** configure synonyms to include more relevant content in your search results
|
||||
- **[Geosearch](https://www.meilisearch.com/docs/learn/fine_tuning_results/geosearch?utm_campaign=oss&utm_source=github&utm_medium=meilisearch&utm_content=features):** filter and sort documents based on geographic data
|
||||
|
||||
@@ -18,6 +18,7 @@ derive_builder = "0.12.0"
|
||||
dump = { path = "../dump" }
|
||||
enum-iterator = "1.4.0"
|
||||
file-store = { path = "../file-store" }
|
||||
flate2 = "1.0.28"
|
||||
log = "0.4.17"
|
||||
meilisearch-auth = { path = "../meilisearch-auth" }
|
||||
meilisearch-types = { path = "../meilisearch-types" }
|
||||
@@ -30,6 +31,7 @@ synchronoise = "1.0.1"
|
||||
tempfile = "3.5.0"
|
||||
thiserror = "1.0.40"
|
||||
time = { version = "0.3.20", features = ["serde-well-known", "formatting", "parsing", "macros"] }
|
||||
ureq = "2.9.1"
|
||||
uuid = { version = "1.3.1", features = ["serde", "v4"] }
|
||||
|
||||
[dev-dependencies]
|
||||
|
||||
@@ -60,7 +60,7 @@ pub(crate) enum Batch {
|
||||
/// The list of tasks that were processing when this task cancelation appeared.
|
||||
previous_processing_tasks: RoaringBitmap,
|
||||
},
|
||||
TaskDeletion(Task),
|
||||
TaskDeletions(Vec<Task>),
|
||||
SnapshotCreation(Vec<Task>),
|
||||
Dump(Task),
|
||||
IndexOperation {
|
||||
@@ -146,13 +146,12 @@ impl Batch {
|
||||
pub fn ids(&self) -> Vec<TaskId> {
|
||||
match self {
|
||||
Batch::TaskCancelation { task, .. }
|
||||
| Batch::TaskDeletion(task)
|
||||
| Batch::Dump(task)
|
||||
| Batch::IndexCreation { task, .. }
|
||||
| Batch::IndexUpdate { task, .. } => vec![task.uid],
|
||||
Batch::SnapshotCreation(tasks) | Batch::IndexDeletion { tasks, .. } => {
|
||||
tasks.iter().map(|task| task.uid).collect()
|
||||
}
|
||||
Batch::SnapshotCreation(tasks)
|
||||
| Batch::TaskDeletions(tasks)
|
||||
| Batch::IndexDeletion { tasks, .. } => tasks.iter().map(|task| task.uid).collect(),
|
||||
Batch::IndexOperation { op, .. } => match op {
|
||||
IndexOperation::DocumentOperation { tasks, .. }
|
||||
| IndexOperation::Settings { tasks, .. }
|
||||
@@ -180,7 +179,7 @@ impl Batch {
|
||||
use Batch::*;
|
||||
match self {
|
||||
TaskCancelation { .. }
|
||||
| TaskDeletion(_)
|
||||
| TaskDeletions(_)
|
||||
| SnapshotCreation(_)
|
||||
| Dump(_)
|
||||
| IndexSwap { .. } => None,
|
||||
@@ -199,7 +198,7 @@ impl fmt::Display for Batch {
|
||||
let tasks = self.ids();
|
||||
match self {
|
||||
Batch::TaskCancelation { .. } => f.write_str("TaskCancelation")?,
|
||||
Batch::TaskDeletion(_) => f.write_str("TaskDeletion")?,
|
||||
Batch::TaskDeletions(_) => f.write_str("TaskDeletion")?,
|
||||
Batch::SnapshotCreation(_) => f.write_str("SnapshotCreation")?,
|
||||
Batch::Dump(_) => f.write_str("Dump")?,
|
||||
Batch::IndexOperation { op, .. } => write!(f, "{op}")?,
|
||||
@@ -539,9 +538,9 @@ impl IndexScheduler {
|
||||
|
||||
// 2. we get the next task to delete
|
||||
let to_delete = self.get_kind(rtxn, Kind::TaskDeletion)? & enqueued;
|
||||
if let Some(task_id) = to_delete.min() {
|
||||
let task = self.get_task(rtxn, task_id)?.ok_or(Error::CorruptedTaskQueue)?;
|
||||
return Ok(Some(Batch::TaskDeletion(task)));
|
||||
if !to_delete.is_empty() {
|
||||
let tasks = self.get_existing_tasks(rtxn, to_delete)?;
|
||||
return Ok(Some(Batch::TaskDeletions(tasks)));
|
||||
}
|
||||
|
||||
// 3. we batch the snapshot.
|
||||
@@ -681,31 +680,43 @@ impl IndexScheduler {
|
||||
|
||||
Ok(vec![task])
|
||||
}
|
||||
Batch::TaskDeletion(mut task) => {
|
||||
Batch::TaskDeletions(mut tasks) => {
|
||||
// 1. Retrieve the tasks that matched the query at enqueue-time.
|
||||
let matched_tasks =
|
||||
let mut matched_tasks = RoaringBitmap::new();
|
||||
|
||||
for task in tasks.iter() {
|
||||
if let KindWithContent::TaskDeletion { tasks, query: _ } = &task.kind {
|
||||
tasks
|
||||
matched_tasks |= tasks;
|
||||
} else {
|
||||
unreachable!()
|
||||
}
|
||||
}
|
||||
|
||||
let mut wtxn = self.env.write_txn()?;
|
||||
let mut deleted_tasks = self.delete_matched_tasks(&mut wtxn, &matched_tasks)?;
|
||||
wtxn.commit()?;
|
||||
|
||||
for task in tasks.iter_mut() {
|
||||
task.status = Status::Succeeded;
|
||||
let KindWithContent::TaskDeletion { tasks, query: _ } = &task.kind else {
|
||||
unreachable!()
|
||||
};
|
||||
|
||||
let mut wtxn = self.env.write_txn()?;
|
||||
let deleted_tasks_count = self.delete_matched_tasks(&mut wtxn, matched_tasks)?;
|
||||
let deleted_tasks_count = deleted_tasks.intersection_len(tasks);
|
||||
deleted_tasks -= tasks;
|
||||
|
||||
task.status = Status::Succeeded;
|
||||
match &mut task.details {
|
||||
Some(Details::TaskDeletion {
|
||||
matched_tasks: _,
|
||||
deleted_tasks,
|
||||
original_filter: _,
|
||||
}) => {
|
||||
*deleted_tasks = Some(deleted_tasks_count);
|
||||
match &mut task.details {
|
||||
Some(Details::TaskDeletion {
|
||||
matched_tasks: _,
|
||||
deleted_tasks,
|
||||
original_filter: _,
|
||||
}) => {
|
||||
*deleted_tasks = Some(deleted_tasks_count);
|
||||
}
|
||||
_ => unreachable!(),
|
||||
}
|
||||
_ => unreachable!(),
|
||||
}
|
||||
wtxn.commit()?;
|
||||
Ok(vec![task])
|
||||
Ok(tasks)
|
||||
}
|
||||
Batch::SnapshotCreation(mut tasks) => {
|
||||
fs::create_dir_all(&self.snapshots_path)?;
|
||||
@@ -936,8 +947,8 @@ impl IndexScheduler {
|
||||
};
|
||||
|
||||
// the index operation can take a long time, so save this handle to make it available to the search for the duration of the tick
|
||||
*self.currently_updating_index.write().unwrap() =
|
||||
Some((index_uid.clone(), index.clone()));
|
||||
self.index_mapper
|
||||
.set_currently_updating_index(Some((index_uid.clone(), index.clone())));
|
||||
|
||||
let mut index_wtxn = index.write_txn()?;
|
||||
let tasks = self.apply_index_operation(&mut index_wtxn, &index, op)?;
|
||||
@@ -1351,9 +1362,6 @@ impl IndexScheduler {
|
||||
|
||||
for (task, (_, settings)) in tasks.iter_mut().zip(settings) {
|
||||
let checked_settings = settings.clone().check();
|
||||
if matches!(checked_settings.embedders, milli::update::Setting::Set(_)) {
|
||||
self.features().check_vector("Passing `embedders` in settings")?
|
||||
}
|
||||
task.details = Some(Details::SettingsUpdate { settings: Box::new(settings) });
|
||||
apply_settings_to_builder(&checked_settings, &mut builder);
|
||||
|
||||
@@ -1438,7 +1446,11 @@ impl IndexScheduler {
|
||||
/// Delete each given task from all the databases (if it is deleteable).
|
||||
///
|
||||
/// Return the number of tasks that were actually deleted.
|
||||
fn delete_matched_tasks(&self, wtxn: &mut RwTxn, matched_tasks: &RoaringBitmap) -> Result<u64> {
|
||||
fn delete_matched_tasks(
|
||||
&self,
|
||||
wtxn: &mut RwTxn,
|
||||
matched_tasks: &RoaringBitmap,
|
||||
) -> Result<RoaringBitmap> {
|
||||
// 1. Remove from this list the tasks that we are not allowed to delete
|
||||
let enqueued_tasks = self.get_status(wtxn, Status::Enqueued)?;
|
||||
let processing_tasks = &self.processing_tasks.read().unwrap().processing.clone();
|
||||
@@ -1503,7 +1515,7 @@ impl IndexScheduler {
|
||||
}
|
||||
}
|
||||
|
||||
Ok(to_delete_tasks.len())
|
||||
Ok(to_delete_tasks)
|
||||
}
|
||||
|
||||
/// Cancel each given task from all the databases (if it is cancelable).
|
||||
|
||||
@@ -69,6 +69,10 @@ pub struct IndexMapper {
|
||||
/// Whether we open a meilisearch index with the MDB_WRITEMAP option or not.
|
||||
enable_mdb_writemap: bool,
|
||||
pub indexer_config: Arc<IndexerConfig>,
|
||||
|
||||
/// A few types of long running batches of tasks that act on a single index set this field
|
||||
/// so that a handle to the index is available from other threads (search) in an optimized manner.
|
||||
currently_updating_index: Arc<RwLock<Option<(String, Index)>>>,
|
||||
}
|
||||
|
||||
/// Whether the index is available for use or is forbidden to be inserted back in the index map
|
||||
@@ -151,6 +155,7 @@ impl IndexMapper {
|
||||
index_growth_amount,
|
||||
enable_mdb_writemap,
|
||||
indexer_config: Arc::new(indexer_config),
|
||||
currently_updating_index: Default::default(),
|
||||
})
|
||||
}
|
||||
|
||||
@@ -303,6 +308,14 @@ impl IndexMapper {
|
||||
|
||||
/// Return an index, may open it if it wasn't already opened.
|
||||
pub fn index(&self, rtxn: &RoTxn, name: &str) -> Result<Index> {
|
||||
if let Some((current_name, current_index)) =
|
||||
self.currently_updating_index.read().unwrap().as_ref()
|
||||
{
|
||||
if current_name == name {
|
||||
return Ok(current_index.clone());
|
||||
}
|
||||
}
|
||||
|
||||
let uuid = self
|
||||
.index_mapping
|
||||
.get(rtxn, name)?
|
||||
@@ -474,4 +487,8 @@ impl IndexMapper {
|
||||
pub fn indexer_config(&self) -> &IndexerConfig {
|
||||
&self.indexer_config
|
||||
}
|
||||
|
||||
pub fn set_currently_updating_index(&self, index: Option<(String, Index)>) {
|
||||
*self.currently_updating_index.write().unwrap() = index;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -37,10 +37,11 @@ pub fn snapshot_index_scheduler(scheduler: &IndexScheduler) -> String {
|
||||
snapshots_path: _,
|
||||
auth_path: _,
|
||||
version_file_path: _,
|
||||
webhook_url: _,
|
||||
webhook_authorization_header: _,
|
||||
test_breakpoint_sdr: _,
|
||||
planned_failures: _,
|
||||
run_loop_iteration: _,
|
||||
currently_updating_index: _,
|
||||
embedders: _,
|
||||
} = scheduler;
|
||||
|
||||
|
||||
@@ -34,6 +34,7 @@ pub type TaskId = u32;
|
||||
|
||||
use std::collections::{BTreeMap, HashMap};
|
||||
use std::fs::File;
|
||||
use std::io::{self, BufReader, Read};
|
||||
use std::ops::{Bound, RangeBounds};
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::sync::atomic::AtomicBool;
|
||||
@@ -45,6 +46,8 @@ use dump::{KindDump, TaskDump, UpdateFile};
|
||||
pub use error::Error;
|
||||
pub use features::RoFeatures;
|
||||
use file_store::FileStore;
|
||||
use flate2::bufread::GzEncoder;
|
||||
use flate2::Compression;
|
||||
use meilisearch_types::error::ResponseError;
|
||||
use meilisearch_types::features::{InstanceTogglableFeatures, RuntimeTogglableFeatures};
|
||||
use meilisearch_types::heed::byteorder::BE;
|
||||
@@ -54,6 +57,7 @@ use meilisearch_types::milli::documents::DocumentsBatchBuilder;
|
||||
use meilisearch_types::milli::update::IndexerConfig;
|
||||
use meilisearch_types::milli::vector::{Embedder, EmbedderOptions, EmbeddingConfigs};
|
||||
use meilisearch_types::milli::{self, CboRoaringBitmapCodec, Index, RoaringBitmapCodec, BEU32};
|
||||
use meilisearch_types::task_view::TaskView;
|
||||
use meilisearch_types::tasks::{Kind, KindWithContent, Status, Task};
|
||||
use puffin::FrameView;
|
||||
use roaring::RoaringBitmap;
|
||||
@@ -170,8 +174,8 @@ impl ProcessingTasks {
|
||||
}
|
||||
|
||||
/// Set the processing tasks to an empty list
|
||||
fn stop_processing(&mut self) {
|
||||
self.processing = RoaringBitmap::new();
|
||||
fn stop_processing(&mut self) -> RoaringBitmap {
|
||||
std::mem::take(&mut self.processing)
|
||||
}
|
||||
|
||||
/// Returns `true` if there, at least, is one task that is currently processing that we must stop.
|
||||
@@ -241,6 +245,10 @@ pub struct IndexSchedulerOptions {
|
||||
pub snapshots_path: PathBuf,
|
||||
/// The path to the folder containing the dumps.
|
||||
pub dumps_path: PathBuf,
|
||||
/// The URL on which we must send the tasks statuses
|
||||
pub webhook_url: Option<String>,
|
||||
/// The value we will send into the Authorization HTTP header on the webhook URL
|
||||
pub webhook_authorization_header: Option<String>,
|
||||
/// The maximum size, in bytes, of the task index.
|
||||
pub task_db_size: usize,
|
||||
/// The size, in bytes, with which a meilisearch index is opened the first time of each meilisearch index.
|
||||
@@ -323,6 +331,11 @@ pub struct IndexScheduler {
|
||||
/// The maximum number of tasks that will be batched together.
|
||||
pub(crate) max_number_of_batched_tasks: usize,
|
||||
|
||||
/// The webhook url we should send tasks to after processing every batches.
|
||||
pub(crate) webhook_url: Option<String>,
|
||||
/// The Authorization header to send to the webhook URL.
|
||||
pub(crate) webhook_authorization_header: Option<String>,
|
||||
|
||||
/// A frame to output the indexation profiling files to disk.
|
||||
pub(crate) puffin_frame: Arc<puffin::GlobalFrameView>,
|
||||
|
||||
@@ -338,10 +351,6 @@ pub struct IndexScheduler {
|
||||
/// The path to the version file of Meilisearch.
|
||||
pub(crate) version_file_path: PathBuf,
|
||||
|
||||
/// A few types of long running batches of tasks that act on a single index set this field
|
||||
/// so that a handle to the index is available from other threads (search) in an optimized manner.
|
||||
currently_updating_index: Arc<RwLock<Option<(String, Index)>>>,
|
||||
|
||||
embedders: Arc<RwLock<HashMap<EmbedderOptions, Arc<Embedder>>>>,
|
||||
|
||||
// ================= test
|
||||
@@ -388,7 +397,8 @@ impl IndexScheduler {
|
||||
dumps_path: self.dumps_path.clone(),
|
||||
auth_path: self.auth_path.clone(),
|
||||
version_file_path: self.version_file_path.clone(),
|
||||
currently_updating_index: self.currently_updating_index.clone(),
|
||||
webhook_url: self.webhook_url.clone(),
|
||||
webhook_authorization_header: self.webhook_authorization_header.clone(),
|
||||
embedders: self.embedders.clone(),
|
||||
#[cfg(test)]
|
||||
test_breakpoint_sdr: self.test_breakpoint_sdr.clone(),
|
||||
@@ -487,7 +497,8 @@ impl IndexScheduler {
|
||||
snapshots_path: options.snapshots_path,
|
||||
auth_path: options.auth_path,
|
||||
version_file_path: options.version_file_path,
|
||||
currently_updating_index: Arc::new(RwLock::new(None)),
|
||||
webhook_url: options.webhook_url,
|
||||
webhook_authorization_header: options.webhook_authorization_header,
|
||||
embedders: Default::default(),
|
||||
|
||||
#[cfg(test)]
|
||||
@@ -671,13 +682,6 @@ impl IndexScheduler {
|
||||
/// If you need to fetch information from or perform an action on all indexes,
|
||||
/// see the `try_for_each_index` function.
|
||||
pub fn index(&self, name: &str) -> Result<Index> {
|
||||
if let Some((current_name, current_index)) =
|
||||
self.currently_updating_index.read().unwrap().as_ref()
|
||||
{
|
||||
if current_name == name {
|
||||
return Ok(current_index.clone());
|
||||
}
|
||||
}
|
||||
let rtxn = self.env.read_txn()?;
|
||||
self.index_mapper.index(&rtxn, name)
|
||||
}
|
||||
@@ -1158,7 +1162,7 @@ impl IndexScheduler {
|
||||
};
|
||||
|
||||
// Reset the currently updating index to relinquish the index handle
|
||||
*self.currently_updating_index.write().unwrap() = None;
|
||||
self.index_mapper.set_currently_updating_index(None);
|
||||
|
||||
#[cfg(test)]
|
||||
self.maybe_fail(tests::FailureLocation::AcquiringWtxn)?;
|
||||
@@ -1251,19 +1255,99 @@ impl IndexScheduler {
|
||||
}
|
||||
}
|
||||
|
||||
self.processing_tasks.write().unwrap().stop_processing();
|
||||
let processed = self.processing_tasks.write().unwrap().stop_processing();
|
||||
|
||||
#[cfg(test)]
|
||||
self.maybe_fail(tests::FailureLocation::CommittingWtxn)?;
|
||||
|
||||
wtxn.commit().map_err(Error::HeedTransaction)?;
|
||||
|
||||
// We shouldn't crash the tick function if we can't send data to the webhook.
|
||||
let _ = self.notify_webhook(&processed);
|
||||
|
||||
#[cfg(test)]
|
||||
self.breakpoint(Breakpoint::AfterProcessing);
|
||||
|
||||
Ok(TickOutcome::TickAgain(processed_tasks))
|
||||
}
|
||||
|
||||
/// Once the tasks changes have been commited we must send all the tasks that were updated to our webhook if there is one.
|
||||
fn notify_webhook(&self, updated: &RoaringBitmap) -> Result<()> {
|
||||
if let Some(ref url) = self.webhook_url {
|
||||
struct TaskReader<'a, 'b> {
|
||||
rtxn: &'a RoTxn<'a>,
|
||||
index_scheduler: &'a IndexScheduler,
|
||||
tasks: &'b mut roaring::bitmap::Iter<'b>,
|
||||
buffer: Vec<u8>,
|
||||
written: usize,
|
||||
}
|
||||
|
||||
impl<'a, 'b> Read for TaskReader<'a, 'b> {
|
||||
fn read(&mut self, mut buf: &mut [u8]) -> std::io::Result<usize> {
|
||||
if self.buffer.is_empty() {
|
||||
match self.tasks.next() {
|
||||
None => return Ok(0),
|
||||
Some(task_id) => {
|
||||
let task = self
|
||||
.index_scheduler
|
||||
.get_task(self.rtxn, task_id)
|
||||
.map_err(|err| io::Error::new(io::ErrorKind::Other, err))?
|
||||
.ok_or_else(|| {
|
||||
io::Error::new(
|
||||
io::ErrorKind::Other,
|
||||
Error::CorruptedTaskQueue,
|
||||
)
|
||||
})?;
|
||||
|
||||
serde_json::to_writer(
|
||||
&mut self.buffer,
|
||||
&TaskView::from_task(&task),
|
||||
)?;
|
||||
self.buffer.push(b'\n');
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let mut to_write = &self.buffer[self.written..];
|
||||
let wrote = io::copy(&mut to_write, &mut buf)?;
|
||||
self.written += wrote as usize;
|
||||
|
||||
// we wrote everything and must refresh our buffer on the next call
|
||||
if self.written == self.buffer.len() {
|
||||
self.written = 0;
|
||||
self.buffer.clear();
|
||||
}
|
||||
|
||||
Ok(wrote as usize)
|
||||
}
|
||||
}
|
||||
|
||||
let rtxn = self.env.read_txn()?;
|
||||
|
||||
let task_reader = TaskReader {
|
||||
rtxn: &rtxn,
|
||||
index_scheduler: self,
|
||||
tasks: &mut updated.into_iter(),
|
||||
buffer: Vec::with_capacity(50), // on average a task is around ~100 bytes
|
||||
written: 0,
|
||||
};
|
||||
|
||||
// let reader = GzEncoder::new(BufReader::new(task_reader), Compression::default());
|
||||
let reader = GzEncoder::new(BufReader::new(task_reader), Compression::default());
|
||||
let request = ureq::post(url).set("Content-Encoding", "gzip");
|
||||
let request = match &self.webhook_authorization_header {
|
||||
Some(header) => request.set("Authorization", header),
|
||||
None => request,
|
||||
};
|
||||
|
||||
if let Err(e) = request.send(reader) {
|
||||
log::error!("While sending data to the webhook: {e}");
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Register a task to cleanup the task queue if needed
|
||||
fn cleanup_task_queue(&self) -> Result<()> {
|
||||
let rtxn = self.env.read_txn().map_err(Error::HeedTransaction)?;
|
||||
@@ -1677,6 +1761,8 @@ mod tests {
|
||||
indexes_path: tempdir.path().join("indexes"),
|
||||
snapshots_path: tempdir.path().join("snapshots"),
|
||||
dumps_path: tempdir.path().join("dumps"),
|
||||
webhook_url: None,
|
||||
webhook_authorization_header: None,
|
||||
task_db_size: 1000 * 1000, // 1 MB, we don't use MiB on purpose.
|
||||
index_base_map_size: 1000 * 1000, // 1 MB, we don't use MiB on purpose.
|
||||
enable_mdb_writemap: false,
|
||||
@@ -2158,10 +2244,7 @@ mod tests {
|
||||
.unwrap();
|
||||
index_scheduler.assert_internally_consistent();
|
||||
}
|
||||
for _ in 0..2 {
|
||||
handle.advance_one_successful_batch();
|
||||
index_scheduler.assert_internally_consistent();
|
||||
}
|
||||
handle.advance_one_successful_batch();
|
||||
|
||||
snapshot!(snapshot_index_scheduler(&index_scheduler), name: "task_deletion_processed");
|
||||
}
|
||||
|
||||
@@ -34,12 +34,10 @@ catto: { number_of_documents: 1, field_distribution: {"id": 1} }
|
||||
[timestamp] [3,]
|
||||
----------------------------------------------------------------------
|
||||
### Started At:
|
||||
[timestamp] [2,]
|
||||
[timestamp] [3,]
|
||||
[timestamp] [2,3,]
|
||||
----------------------------------------------------------------------
|
||||
### Finished At:
|
||||
[timestamp] [2,]
|
||||
[timestamp] [3,]
|
||||
[timestamp] [2,3,]
|
||||
----------------------------------------------------------------------
|
||||
### File Store:
|
||||
00000000-0000-0000-0000-000000000001
|
||||
|
||||
@@ -344,7 +344,10 @@ impl ErrorCode for milli::Error {
|
||||
Code::InvalidDocumentId
|
||||
}
|
||||
UserError::MissingDocumentField(_) => Code::InvalidDocumentFields,
|
||||
UserError::InvalidPrompt(_) => Code::InvalidSettingsEmbedders,
|
||||
UserError::InvalidFieldForSource { .. }
|
||||
| UserError::MissingFieldForSource { .. }
|
||||
| UserError::InvalidOpenAiModel { .. }
|
||||
| UserError::InvalidPrompt(_) => Code::InvalidSettingsEmbedders,
|
||||
UserError::TooManyEmbedders(_) => Code::InvalidSettingsEmbedders,
|
||||
UserError::InvalidPromptForEmbeddings(..) => Code::InvalidSettingsEmbedders,
|
||||
UserError::NoPrimaryKeyCandidateFound => Code::IndexPrimaryKeyNoCandidateFound,
|
||||
|
||||
@@ -9,6 +9,7 @@ pub mod index_uid_pattern;
|
||||
pub mod keys;
|
||||
pub mod settings;
|
||||
pub mod star_or;
|
||||
pub mod task_view;
|
||||
pub mod tasks;
|
||||
pub mod versioning;
|
||||
pub use milli::{heed, Index};
|
||||
|
||||
@@ -318,6 +318,21 @@ impl Settings<Unchecked> {
|
||||
_kind: PhantomData,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn validate(self) -> Result<Self, milli::Error> {
|
||||
self.validate_embedding_settings()
|
||||
}
|
||||
|
||||
fn validate_embedding_settings(mut self) -> Result<Self, milli::Error> {
|
||||
let Setting::Set(mut configs) = self.embedders else { return Ok(self) };
|
||||
for (name, config) in configs.iter_mut() {
|
||||
let config_to_check = std::mem::take(config);
|
||||
let checked_config = milli::update::validate_embedding_settings(config_to_check, name)?;
|
||||
*config = checked_config
|
||||
}
|
||||
self.embedders = Setting::Set(configs);
|
||||
Ok(self)
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
@@ -585,11 +600,12 @@ pub fn settings(
|
||||
),
|
||||
};
|
||||
|
||||
let embedders = index
|
||||
let embedders: BTreeMap<_, _> = index
|
||||
.embedding_configs(rtxn)?
|
||||
.into_iter()
|
||||
.map(|(name, config)| (name, Setting::Set(config.into())))
|
||||
.collect();
|
||||
let embedders = if embedders.is_empty() { Setting::NotSet } else { Setting::Set(embedders) };
|
||||
|
||||
Ok(Settings {
|
||||
displayed_attributes: match displayed_attributes {
|
||||
@@ -611,15 +627,12 @@ pub fn settings(
|
||||
Some(field) => Setting::Set(field),
|
||||
None => Setting::Reset,
|
||||
},
|
||||
proximity_precision: match proximity_precision {
|
||||
Some(precision) => Setting::Set(precision),
|
||||
None => Setting::Reset,
|
||||
},
|
||||
proximity_precision: Setting::Set(proximity_precision.unwrap_or_default()),
|
||||
synonyms: Setting::Set(synonyms),
|
||||
typo_tolerance: Setting::Set(typo_tolerance),
|
||||
faceting: Setting::Set(faceting),
|
||||
pagination: Setting::Set(pagination),
|
||||
embedders: Setting::Set(embedders),
|
||||
embedders,
|
||||
_kind: PhantomData,
|
||||
})
|
||||
}
|
||||
@@ -720,10 +733,11 @@ impl From<RankingRuleView> for Criterion {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Deserr, Serialize, Deserialize)]
|
||||
#[derive(Default, Debug, Clone, Copy, PartialEq, Eq, Deserr, Serialize, Deserialize)]
|
||||
#[serde(deny_unknown_fields, rename_all = "camelCase")]
|
||||
#[deserr(error = DeserrJsonError<InvalidSettingsProximityPrecision>, rename_all = camelCase, deny_unknown_fields)]
|
||||
pub enum ProximityPrecisionView {
|
||||
#[default]
|
||||
ByWord,
|
||||
ByAttribute,
|
||||
}
|
||||
|
||||
139
meilisearch-types/src/task_view.rs
Normal file
139
meilisearch-types/src/task_view.rs
Normal file
@@ -0,0 +1,139 @@
|
||||
use serde::Serialize;
|
||||
use time::{Duration, OffsetDateTime};
|
||||
|
||||
use crate::error::ResponseError;
|
||||
use crate::settings::{Settings, Unchecked};
|
||||
use crate::tasks::{serialize_duration, Details, IndexSwap, Kind, Status, Task, TaskId};
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq, Serialize)]
|
||||
#[serde(rename_all = "camelCase")]
|
||||
pub struct TaskView {
|
||||
pub uid: TaskId,
|
||||
#[serde(default)]
|
||||
pub index_uid: Option<String>,
|
||||
pub status: Status,
|
||||
#[serde(rename = "type")]
|
||||
pub kind: Kind,
|
||||
pub canceled_by: Option<TaskId>,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub details: Option<DetailsView>,
|
||||
pub error: Option<ResponseError>,
|
||||
#[serde(serialize_with = "serialize_duration", default)]
|
||||
pub duration: Option<Duration>,
|
||||
#[serde(with = "time::serde::rfc3339")]
|
||||
pub enqueued_at: OffsetDateTime,
|
||||
#[serde(with = "time::serde::rfc3339::option", default)]
|
||||
pub started_at: Option<OffsetDateTime>,
|
||||
#[serde(with = "time::serde::rfc3339::option", default)]
|
||||
pub finished_at: Option<OffsetDateTime>,
|
||||
}
|
||||
|
||||
impl TaskView {
|
||||
pub fn from_task(task: &Task) -> TaskView {
|
||||
TaskView {
|
||||
uid: task.uid,
|
||||
index_uid: task.index_uid().map(ToOwned::to_owned),
|
||||
status: task.status,
|
||||
kind: task.kind.as_kind(),
|
||||
canceled_by: task.canceled_by,
|
||||
details: task.details.clone().map(DetailsView::from),
|
||||
error: task.error.clone(),
|
||||
duration: task.started_at.zip(task.finished_at).map(|(start, end)| end - start),
|
||||
enqueued_at: task.enqueued_at,
|
||||
started_at: task.started_at,
|
||||
finished_at: task.finished_at,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Default, Debug, PartialEq, Eq, Clone, Serialize)]
|
||||
#[serde(rename_all = "camelCase")]
|
||||
pub struct DetailsView {
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub received_documents: Option<u64>,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub indexed_documents: Option<Option<u64>>,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub primary_key: Option<Option<String>>,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub provided_ids: Option<usize>,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub deleted_documents: Option<Option<u64>>,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub matched_tasks: Option<u64>,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub canceled_tasks: Option<Option<u64>>,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub deleted_tasks: Option<Option<u64>>,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub original_filter: Option<Option<String>>,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub dump_uid: Option<Option<String>>,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
#[serde(flatten)]
|
||||
pub settings: Option<Box<Settings<Unchecked>>>,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub swaps: Option<Vec<IndexSwap>>,
|
||||
}
|
||||
|
||||
impl From<Details> for DetailsView {
|
||||
fn from(details: Details) -> Self {
|
||||
match details {
|
||||
Details::DocumentAdditionOrUpdate { received_documents, indexed_documents } => {
|
||||
DetailsView {
|
||||
received_documents: Some(received_documents),
|
||||
indexed_documents: Some(indexed_documents),
|
||||
..DetailsView::default()
|
||||
}
|
||||
}
|
||||
Details::SettingsUpdate { settings } => {
|
||||
DetailsView { settings: Some(settings), ..DetailsView::default() }
|
||||
}
|
||||
Details::IndexInfo { primary_key } => {
|
||||
DetailsView { primary_key: Some(primary_key), ..DetailsView::default() }
|
||||
}
|
||||
Details::DocumentDeletion {
|
||||
provided_ids: received_document_ids,
|
||||
deleted_documents,
|
||||
} => DetailsView {
|
||||
provided_ids: Some(received_document_ids),
|
||||
deleted_documents: Some(deleted_documents),
|
||||
original_filter: Some(None),
|
||||
..DetailsView::default()
|
||||
},
|
||||
Details::DocumentDeletionByFilter { original_filter, deleted_documents } => {
|
||||
DetailsView {
|
||||
provided_ids: Some(0),
|
||||
original_filter: Some(Some(original_filter)),
|
||||
deleted_documents: Some(deleted_documents),
|
||||
..DetailsView::default()
|
||||
}
|
||||
}
|
||||
Details::ClearAll { deleted_documents } => {
|
||||
DetailsView { deleted_documents: Some(deleted_documents), ..DetailsView::default() }
|
||||
}
|
||||
Details::TaskCancelation { matched_tasks, canceled_tasks, original_filter } => {
|
||||
DetailsView {
|
||||
matched_tasks: Some(matched_tasks),
|
||||
canceled_tasks: Some(canceled_tasks),
|
||||
original_filter: Some(Some(original_filter)),
|
||||
..DetailsView::default()
|
||||
}
|
||||
}
|
||||
Details::TaskDeletion { matched_tasks, deleted_tasks, original_filter } => {
|
||||
DetailsView {
|
||||
matched_tasks: Some(matched_tasks),
|
||||
deleted_tasks: Some(deleted_tasks),
|
||||
original_filter: Some(Some(original_filter)),
|
||||
..DetailsView::default()
|
||||
}
|
||||
}
|
||||
Details::Dump { dump_uid } => {
|
||||
DetailsView { dump_uid: Some(dump_uid), ..DetailsView::default() }
|
||||
}
|
||||
Details::IndexSwap { swaps } => {
|
||||
DetailsView { swaps: Some(swaps), ..Default::default() }
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -104,6 +104,7 @@ walkdir = "2.3.3"
|
||||
yaup = "0.2.1"
|
||||
serde_urlencoded = "0.7.1"
|
||||
termcolor = "1.2.0"
|
||||
url = { version = "2.5.0", features = ["serde"] }
|
||||
|
||||
[dev-dependencies]
|
||||
actix-rt = "2.8.0"
|
||||
@@ -153,5 +154,5 @@ greek = ["meilisearch-types/greek"]
|
||||
khmer = ["meilisearch-types/khmer"]
|
||||
|
||||
[package.metadata.mini-dashboard]
|
||||
assets-url = "https://github.com/meilisearch/mini-dashboard/releases/download/v0.2.11/build.zip"
|
||||
sha1 = "83cd44ed1e5f97ecb581dc9f958a63f4ccc982d9"
|
||||
assets-url = "https://github.com/meilisearch/mini-dashboard/releases/download/v0.2.12/build.zip"
|
||||
sha1 = "acfe9a018c93eb0604ea87ee87bff7df5474e18e"
|
||||
|
||||
@@ -264,6 +264,8 @@ struct Infos {
|
||||
ignore_snapshot_if_db_exists: bool,
|
||||
http_addr: bool,
|
||||
http_payload_size_limit: Byte,
|
||||
task_queue_webhook: bool,
|
||||
task_webhook_authorization_header: bool,
|
||||
log_level: String,
|
||||
max_indexing_memory: MaxMemory,
|
||||
max_indexing_threads: MaxThreads,
|
||||
@@ -290,6 +292,8 @@ impl From<Opt> for Infos {
|
||||
http_addr,
|
||||
master_key: _,
|
||||
env,
|
||||
task_webhook_url,
|
||||
task_webhook_authorization_header,
|
||||
max_index_size: _,
|
||||
max_task_db_size: _,
|
||||
http_payload_size_limit,
|
||||
@@ -343,6 +347,8 @@ impl From<Opt> for Infos {
|
||||
http_addr: http_addr != default_http_addr(),
|
||||
http_payload_size_limit,
|
||||
experimental_max_number_of_batched_tasks,
|
||||
task_queue_webhook: task_webhook_url.is_some(),
|
||||
task_webhook_authorization_header: task_webhook_authorization_header.is_some(),
|
||||
log_level: log_level.to_string(),
|
||||
max_indexing_memory,
|
||||
max_indexing_threads,
|
||||
|
||||
@@ -228,6 +228,8 @@ fn open_or_create_database_unchecked(
|
||||
indexes_path: opt.db_path.join("indexes"),
|
||||
snapshots_path: opt.snapshot_dir.clone(),
|
||||
dumps_path: opt.dump_dir.clone(),
|
||||
webhook_url: opt.task_webhook_url.as_ref().map(|url| url.to_string()),
|
||||
webhook_authorization_header: opt.task_webhook_authorization_header.clone(),
|
||||
task_db_size: opt.max_task_db_size.get_bytes() as usize,
|
||||
index_base_map_size: opt.max_index_size.get_bytes() as usize,
|
||||
enable_mdb_writemap: opt.experimental_reduce_indexing_memory_usage,
|
||||
|
||||
@@ -21,6 +21,7 @@ use rustls::RootCertStore;
|
||||
use rustls_pemfile::{certs, pkcs8_private_keys, rsa_private_keys};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use sysinfo::{RefreshKind, System, SystemExt};
|
||||
use url::Url;
|
||||
|
||||
const POSSIBLE_ENV: [&str; 2] = ["development", "production"];
|
||||
|
||||
@@ -28,6 +29,8 @@ const MEILI_DB_PATH: &str = "MEILI_DB_PATH";
|
||||
const MEILI_HTTP_ADDR: &str = "MEILI_HTTP_ADDR";
|
||||
const MEILI_MASTER_KEY: &str = "MEILI_MASTER_KEY";
|
||||
const MEILI_ENV: &str = "MEILI_ENV";
|
||||
const MEILI_TASK_WEBHOOK_URL: &str = "MEILI_TASK_WEBHOOK_URL";
|
||||
const MEILI_TASK_WEBHOOK_AUTHORIZATION_HEADER: &str = "MEILI_TASK_WEBHOOK_AUTHORIZATION_HEADER";
|
||||
#[cfg(feature = "analytics")]
|
||||
const MEILI_NO_ANALYTICS: &str = "MEILI_NO_ANALYTICS";
|
||||
const MEILI_HTTP_PAYLOAD_SIZE_LIMIT: &str = "MEILI_HTTP_PAYLOAD_SIZE_LIMIT";
|
||||
@@ -156,6 +159,14 @@ pub struct Opt {
|
||||
#[serde(default = "default_env")]
|
||||
pub env: String,
|
||||
|
||||
/// Called whenever a task finishes so a third party can be notified.
|
||||
#[clap(long, env = MEILI_TASK_WEBHOOK_URL)]
|
||||
pub task_webhook_url: Option<Url>,
|
||||
|
||||
/// The Authorization header to send on the webhook URL whenever a task finishes so a third party can be notified.
|
||||
#[clap(long, env = MEILI_TASK_WEBHOOK_AUTHORIZATION_HEADER)]
|
||||
pub task_webhook_authorization_header: Option<String>,
|
||||
|
||||
/// Deactivates Meilisearch's built-in telemetry when provided.
|
||||
///
|
||||
/// Meilisearch automatically collects data from all instances that do not opt out using this flag.
|
||||
@@ -375,6 +386,8 @@ impl Opt {
|
||||
http_addr,
|
||||
master_key,
|
||||
env,
|
||||
task_webhook_url,
|
||||
task_webhook_authorization_header,
|
||||
max_index_size: _,
|
||||
max_task_db_size: _,
|
||||
http_payload_size_limit,
|
||||
@@ -409,6 +422,16 @@ impl Opt {
|
||||
export_to_env_if_not_present(MEILI_MASTER_KEY, master_key);
|
||||
}
|
||||
export_to_env_if_not_present(MEILI_ENV, env);
|
||||
if let Some(task_webhook_url) = task_webhook_url {
|
||||
export_to_env_if_not_present(MEILI_TASK_WEBHOOK_URL, task_webhook_url.to_string());
|
||||
}
|
||||
if let Some(task_webhook_authorization_header) = task_webhook_authorization_header {
|
||||
export_to_env_if_not_present(
|
||||
MEILI_TASK_WEBHOOK_AUTHORIZATION_HEADER,
|
||||
task_webhook_authorization_header,
|
||||
);
|
||||
}
|
||||
|
||||
#[cfg(feature = "analytics")]
|
||||
{
|
||||
export_to_env_if_not_present(MEILI_NO_ANALYTICS, no_analytics.to_string());
|
||||
|
||||
@@ -90,6 +90,11 @@ macro_rules! make_setting_route {
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let new_settings = $crate::routes::indexes::settings::validate_settings(
|
||||
new_settings,
|
||||
&index_scheduler,
|
||||
)?;
|
||||
|
||||
let allow_index_creation =
|
||||
index_scheduler.filters().allow_index_creation(&index_uid);
|
||||
|
||||
@@ -453,7 +458,7 @@ make_setting_route!(
|
||||
json!({
|
||||
"proximity_precision": {
|
||||
"set": precision.is_some(),
|
||||
"value": precision,
|
||||
"value": precision.unwrap_or_default(),
|
||||
}
|
||||
}),
|
||||
Some(req),
|
||||
@@ -582,13 +587,13 @@ fn embedder_analytics(
|
||||
for source in s
|
||||
.values()
|
||||
.filter_map(|config| config.clone().set())
|
||||
.filter_map(|config| config.embedder_options.set())
|
||||
.filter_map(|config| config.source.set())
|
||||
{
|
||||
use meilisearch_types::milli::vector::settings::EmbedderSettings;
|
||||
use meilisearch_types::milli::vector::settings::EmbedderSource;
|
||||
match source {
|
||||
EmbedderSettings::OpenAi(_) => sources.insert("openAi"),
|
||||
EmbedderSettings::HuggingFace(_) => sources.insert("huggingFace"),
|
||||
EmbedderSettings::UserProvided(_) => sources.insert("userProvided"),
|
||||
EmbedderSource::OpenAi => sources.insert("openAi"),
|
||||
EmbedderSource::HuggingFace => sources.insert("huggingFace"),
|
||||
EmbedderSource::UserProvided => sources.insert("userProvided"),
|
||||
};
|
||||
}
|
||||
};
|
||||
@@ -651,6 +656,7 @@ pub async fn update_all(
|
||||
let index_uid = IndexUid::try_from(index_uid.into_inner())?;
|
||||
|
||||
let new_settings = body.into_inner();
|
||||
let new_settings = validate_settings(new_settings, &index_scheduler)?;
|
||||
|
||||
analytics.publish(
|
||||
"Settings Updated".to_string(),
|
||||
@@ -684,7 +690,8 @@ pub async fn update_all(
|
||||
"set": new_settings.distinct_attribute.as_ref().set().is_some()
|
||||
},
|
||||
"proximity_precision": {
|
||||
"set": new_settings.proximity_precision.as_ref().set().is_some()
|
||||
"set": new_settings.proximity_precision.as_ref().set().is_some(),
|
||||
"value": new_settings.proximity_precision.as_ref().set().copied().unwrap_or_default()
|
||||
},
|
||||
"typo_tolerance": {
|
||||
"enabled": new_settings.typo_tolerance
|
||||
@@ -800,3 +807,13 @@ pub async fn delete_all(
|
||||
debug!("returns: {:?}", task);
|
||||
Ok(HttpResponse::Accepted().json(task))
|
||||
}
|
||||
|
||||
fn validate_settings(
|
||||
settings: Settings<Unchecked>,
|
||||
index_scheduler: &IndexScheduler,
|
||||
) -> Result<Settings<Unchecked>, ResponseError> {
|
||||
if matches!(settings.embedders, Setting::Set(_)) {
|
||||
index_scheduler.features().check_vector("Passing `embedders` in settings")?
|
||||
}
|
||||
Ok(settings.validate()?)
|
||||
}
|
||||
|
||||
@@ -8,11 +8,9 @@ use meilisearch_types::deserr::DeserrQueryParamError;
|
||||
use meilisearch_types::error::deserr_codes::*;
|
||||
use meilisearch_types::error::{InvalidTaskDateError, ResponseError};
|
||||
use meilisearch_types::index_uid::IndexUid;
|
||||
use meilisearch_types::settings::{Settings, Unchecked};
|
||||
use meilisearch_types::star_or::{OptionStarOr, OptionStarOrList};
|
||||
use meilisearch_types::tasks::{
|
||||
serialize_duration, Details, IndexSwap, Kind, KindWithContent, Status, Task,
|
||||
};
|
||||
use meilisearch_types::task_view::TaskView;
|
||||
use meilisearch_types::tasks::{Kind, KindWithContent, Status};
|
||||
use serde::Serialize;
|
||||
use serde_json::json;
|
||||
use time::format_description::well_known::Rfc3339;
|
||||
@@ -37,140 +35,6 @@ pub fn configure(cfg: &mut web::ServiceConfig) {
|
||||
.service(web::resource("/cancel").route(web::post().to(SeqHandler(cancel_tasks))))
|
||||
.service(web::resource("/{task_id}").route(web::get().to(SeqHandler(get_task))));
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq, Serialize)]
|
||||
#[serde(rename_all = "camelCase")]
|
||||
pub struct TaskView {
|
||||
pub uid: TaskId,
|
||||
#[serde(default)]
|
||||
pub index_uid: Option<String>,
|
||||
pub status: Status,
|
||||
#[serde(rename = "type")]
|
||||
pub kind: Kind,
|
||||
pub canceled_by: Option<TaskId>,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub details: Option<DetailsView>,
|
||||
pub error: Option<ResponseError>,
|
||||
#[serde(serialize_with = "serialize_duration", default)]
|
||||
pub duration: Option<Duration>,
|
||||
#[serde(with = "time::serde::rfc3339")]
|
||||
pub enqueued_at: OffsetDateTime,
|
||||
#[serde(with = "time::serde::rfc3339::option", default)]
|
||||
pub started_at: Option<OffsetDateTime>,
|
||||
#[serde(with = "time::serde::rfc3339::option", default)]
|
||||
pub finished_at: Option<OffsetDateTime>,
|
||||
}
|
||||
|
||||
impl TaskView {
|
||||
pub fn from_task(task: &Task) -> TaskView {
|
||||
TaskView {
|
||||
uid: task.uid,
|
||||
index_uid: task.index_uid().map(ToOwned::to_owned),
|
||||
status: task.status,
|
||||
kind: task.kind.as_kind(),
|
||||
canceled_by: task.canceled_by,
|
||||
details: task.details.clone().map(DetailsView::from),
|
||||
error: task.error.clone(),
|
||||
duration: task.started_at.zip(task.finished_at).map(|(start, end)| end - start),
|
||||
enqueued_at: task.enqueued_at,
|
||||
started_at: task.started_at,
|
||||
finished_at: task.finished_at,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Default, Debug, PartialEq, Eq, Clone, Serialize)]
|
||||
#[serde(rename_all = "camelCase")]
|
||||
pub struct DetailsView {
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub received_documents: Option<u64>,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub indexed_documents: Option<Option<u64>>,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub primary_key: Option<Option<String>>,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub provided_ids: Option<usize>,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub deleted_documents: Option<Option<u64>>,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub matched_tasks: Option<u64>,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub canceled_tasks: Option<Option<u64>>,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub deleted_tasks: Option<Option<u64>>,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub original_filter: Option<Option<String>>,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub dump_uid: Option<Option<String>>,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
#[serde(flatten)]
|
||||
pub settings: Option<Box<Settings<Unchecked>>>,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub swaps: Option<Vec<IndexSwap>>,
|
||||
}
|
||||
|
||||
impl From<Details> for DetailsView {
|
||||
fn from(details: Details) -> Self {
|
||||
match details {
|
||||
Details::DocumentAdditionOrUpdate { received_documents, indexed_documents } => {
|
||||
DetailsView {
|
||||
received_documents: Some(received_documents),
|
||||
indexed_documents: Some(indexed_documents),
|
||||
..DetailsView::default()
|
||||
}
|
||||
}
|
||||
Details::SettingsUpdate { settings } => {
|
||||
DetailsView { settings: Some(settings), ..DetailsView::default() }
|
||||
}
|
||||
Details::IndexInfo { primary_key } => {
|
||||
DetailsView { primary_key: Some(primary_key), ..DetailsView::default() }
|
||||
}
|
||||
Details::DocumentDeletion {
|
||||
provided_ids: received_document_ids,
|
||||
deleted_documents,
|
||||
} => DetailsView {
|
||||
provided_ids: Some(received_document_ids),
|
||||
deleted_documents: Some(deleted_documents),
|
||||
original_filter: Some(None),
|
||||
..DetailsView::default()
|
||||
},
|
||||
Details::DocumentDeletionByFilter { original_filter, deleted_documents } => {
|
||||
DetailsView {
|
||||
provided_ids: Some(0),
|
||||
original_filter: Some(Some(original_filter)),
|
||||
deleted_documents: Some(deleted_documents),
|
||||
..DetailsView::default()
|
||||
}
|
||||
}
|
||||
Details::ClearAll { deleted_documents } => {
|
||||
DetailsView { deleted_documents: Some(deleted_documents), ..DetailsView::default() }
|
||||
}
|
||||
Details::TaskCancelation { matched_tasks, canceled_tasks, original_filter } => {
|
||||
DetailsView {
|
||||
matched_tasks: Some(matched_tasks),
|
||||
canceled_tasks: Some(canceled_tasks),
|
||||
original_filter: Some(Some(original_filter)),
|
||||
..DetailsView::default()
|
||||
}
|
||||
}
|
||||
Details::TaskDeletion { matched_tasks, deleted_tasks, original_filter } => {
|
||||
DetailsView {
|
||||
matched_tasks: Some(matched_tasks),
|
||||
deleted_tasks: Some(deleted_tasks),
|
||||
original_filter: Some(Some(original_filter)),
|
||||
..DetailsView::default()
|
||||
}
|
||||
}
|
||||
Details::Dump { dump_uid } => {
|
||||
DetailsView { dump_uid: Some(dump_uid), ..DetailsView::default() }
|
||||
}
|
||||
Details::IndexSwap { swaps } => {
|
||||
DetailsView { swaps: Some(swaps), ..Default::default() }
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Deserr)]
|
||||
#[deserr(error = DeserrQueryParamError, rename_all = camelCase, deny_unknown_fields)]
|
||||
pub struct TasksFilterQuery {
|
||||
|
||||
@@ -735,6 +735,9 @@ pub fn perform_facet_search(
|
||||
if let Some(facet_query) = &facet_query {
|
||||
facet_search.query(facet_query);
|
||||
}
|
||||
if let Some(max_facets) = index.max_values_per_facet(&rtxn)? {
|
||||
facet_search.max_values(max_facets as usize);
|
||||
}
|
||||
|
||||
Ok(FacetSearchResult {
|
||||
facet_hits: facet_search.execute()?,
|
||||
@@ -897,6 +900,14 @@ fn format_fields<'a>(
|
||||
let mut matches_position = compute_matches.then(BTreeMap::new);
|
||||
let mut document = document.clone();
|
||||
|
||||
// reduce the formatted option list to the attributes that should be formatted,
|
||||
// instead of all the attributes to display.
|
||||
let formatting_fields_options: Vec<_> = formatted_options
|
||||
.iter()
|
||||
.filter(|(_, option)| option.should_format())
|
||||
.map(|(fid, option)| (field_ids_map.name(*fid).unwrap(), option))
|
||||
.collect();
|
||||
|
||||
// select the attributes to retrieve
|
||||
let displayable_names =
|
||||
displayable_ids.iter().map(|&fid| field_ids_map.name(fid).expect("Missing field name"));
|
||||
@@ -905,13 +916,15 @@ fn format_fields<'a>(
|
||||
// to the value and merge them together. eg. If a user said he wanted to highlight `doggo`
|
||||
// and crop `doggo.name`. `doggo.name` needs to be highlighted + cropped while `doggo.age` is only
|
||||
// highlighted.
|
||||
let format = formatted_options
|
||||
// Warn: The time to compute the format list scales with the number of fields to format;
|
||||
// cumulated with map_leaf_values that iterates over all the nested fields, it gives a quadratic complexity:
|
||||
// d*f where d is the total number of fields to display and f is the total number of fields to format.
|
||||
let format = formatting_fields_options
|
||||
.iter()
|
||||
.filter(|(field, _option)| {
|
||||
let name = field_ids_map.name(**field).unwrap();
|
||||
.filter(|(name, _option)| {
|
||||
milli::is_faceted_by(name, key) || milli::is_faceted_by(key, name)
|
||||
})
|
||||
.map(|(_, option)| *option)
|
||||
.map(|(_, option)| **option)
|
||||
.reduce(|acc, option| acc.merge(option));
|
||||
let mut infos = Vec::new();
|
||||
|
||||
@@ -1008,7 +1021,7 @@ fn format_value<'a>(
|
||||
let value = matcher.format(format_options);
|
||||
Value::String(value.into_owned())
|
||||
}
|
||||
None => Value::Number(number),
|
||||
None => Value::String(s),
|
||||
}
|
||||
}
|
||||
value => value,
|
||||
|
||||
@@ -59,7 +59,7 @@ async fn import_dump_v1_movie_raw() {
|
||||
"dictionary": [],
|
||||
"synonyms": {},
|
||||
"distinctAttribute": null,
|
||||
"proximityPrecision": null,
|
||||
"proximityPrecision": "byWord",
|
||||
"typoTolerance": {
|
||||
"enabled": true,
|
||||
"minWordSizeForTypos": {
|
||||
@@ -77,8 +77,7 @@ async fn import_dump_v1_movie_raw() {
|
||||
},
|
||||
"pagination": {
|
||||
"maxTotalHits": 1000
|
||||
},
|
||||
"embedders": {}
|
||||
}
|
||||
}
|
||||
"###
|
||||
);
|
||||
@@ -221,7 +220,7 @@ async fn import_dump_v1_movie_with_settings() {
|
||||
"dictionary": [],
|
||||
"synonyms": {},
|
||||
"distinctAttribute": null,
|
||||
"proximityPrecision": null,
|
||||
"proximityPrecision": "byWord",
|
||||
"typoTolerance": {
|
||||
"enabled": true,
|
||||
"minWordSizeForTypos": {
|
||||
@@ -239,8 +238,7 @@ async fn import_dump_v1_movie_with_settings() {
|
||||
},
|
||||
"pagination": {
|
||||
"maxTotalHits": 1000
|
||||
},
|
||||
"embedders": {}
|
||||
}
|
||||
}
|
||||
"###
|
||||
);
|
||||
@@ -369,7 +367,7 @@ async fn import_dump_v1_rubygems_with_settings() {
|
||||
"dictionary": [],
|
||||
"synonyms": {},
|
||||
"distinctAttribute": null,
|
||||
"proximityPrecision": null,
|
||||
"proximityPrecision": "byWord",
|
||||
"typoTolerance": {
|
||||
"enabled": true,
|
||||
"minWordSizeForTypos": {
|
||||
@@ -387,8 +385,7 @@ async fn import_dump_v1_rubygems_with_settings() {
|
||||
},
|
||||
"pagination": {
|
||||
"maxTotalHits": 1000
|
||||
},
|
||||
"embedders": {}
|
||||
}
|
||||
}
|
||||
"###
|
||||
);
|
||||
@@ -503,7 +500,7 @@ async fn import_dump_v2_movie_raw() {
|
||||
"dictionary": [],
|
||||
"synonyms": {},
|
||||
"distinctAttribute": null,
|
||||
"proximityPrecision": null,
|
||||
"proximityPrecision": "byWord",
|
||||
"typoTolerance": {
|
||||
"enabled": true,
|
||||
"minWordSizeForTypos": {
|
||||
@@ -521,8 +518,7 @@ async fn import_dump_v2_movie_raw() {
|
||||
},
|
||||
"pagination": {
|
||||
"maxTotalHits": 1000
|
||||
},
|
||||
"embedders": {}
|
||||
}
|
||||
}
|
||||
"###
|
||||
);
|
||||
@@ -649,7 +645,7 @@ async fn import_dump_v2_movie_with_settings() {
|
||||
"dictionary": [],
|
||||
"synonyms": {},
|
||||
"distinctAttribute": null,
|
||||
"proximityPrecision": null,
|
||||
"proximityPrecision": "byWord",
|
||||
"typoTolerance": {
|
||||
"enabled": true,
|
||||
"minWordSizeForTypos": {
|
||||
@@ -667,8 +663,7 @@ async fn import_dump_v2_movie_with_settings() {
|
||||
},
|
||||
"pagination": {
|
||||
"maxTotalHits": 1000
|
||||
},
|
||||
"embedders": {}
|
||||
}
|
||||
}
|
||||
"###
|
||||
);
|
||||
@@ -794,7 +789,7 @@ async fn import_dump_v2_rubygems_with_settings() {
|
||||
"dictionary": [],
|
||||
"synonyms": {},
|
||||
"distinctAttribute": null,
|
||||
"proximityPrecision": null,
|
||||
"proximityPrecision": "byWord",
|
||||
"typoTolerance": {
|
||||
"enabled": true,
|
||||
"minWordSizeForTypos": {
|
||||
@@ -812,8 +807,7 @@ async fn import_dump_v2_rubygems_with_settings() {
|
||||
},
|
||||
"pagination": {
|
||||
"maxTotalHits": 1000
|
||||
},
|
||||
"embedders": {}
|
||||
}
|
||||
}
|
||||
"###
|
||||
);
|
||||
@@ -928,7 +922,7 @@ async fn import_dump_v3_movie_raw() {
|
||||
"dictionary": [],
|
||||
"synonyms": {},
|
||||
"distinctAttribute": null,
|
||||
"proximityPrecision": null,
|
||||
"proximityPrecision": "byWord",
|
||||
"typoTolerance": {
|
||||
"enabled": true,
|
||||
"minWordSizeForTypos": {
|
||||
@@ -946,8 +940,7 @@ async fn import_dump_v3_movie_raw() {
|
||||
},
|
||||
"pagination": {
|
||||
"maxTotalHits": 1000
|
||||
},
|
||||
"embedders": {}
|
||||
}
|
||||
}
|
||||
"###
|
||||
);
|
||||
@@ -1074,7 +1067,7 @@ async fn import_dump_v3_movie_with_settings() {
|
||||
"dictionary": [],
|
||||
"synonyms": {},
|
||||
"distinctAttribute": null,
|
||||
"proximityPrecision": null,
|
||||
"proximityPrecision": "byWord",
|
||||
"typoTolerance": {
|
||||
"enabled": true,
|
||||
"minWordSizeForTypos": {
|
||||
@@ -1092,8 +1085,7 @@ async fn import_dump_v3_movie_with_settings() {
|
||||
},
|
||||
"pagination": {
|
||||
"maxTotalHits": 1000
|
||||
},
|
||||
"embedders": {}
|
||||
}
|
||||
}
|
||||
"###
|
||||
);
|
||||
@@ -1219,7 +1211,7 @@ async fn import_dump_v3_rubygems_with_settings() {
|
||||
"dictionary": [],
|
||||
"synonyms": {},
|
||||
"distinctAttribute": null,
|
||||
"proximityPrecision": null,
|
||||
"proximityPrecision": "byWord",
|
||||
"typoTolerance": {
|
||||
"enabled": true,
|
||||
"minWordSizeForTypos": {
|
||||
@@ -1237,8 +1229,7 @@ async fn import_dump_v3_rubygems_with_settings() {
|
||||
},
|
||||
"pagination": {
|
||||
"maxTotalHits": 1000
|
||||
},
|
||||
"embedders": {}
|
||||
}
|
||||
}
|
||||
"###
|
||||
);
|
||||
@@ -1353,7 +1344,7 @@ async fn import_dump_v4_movie_raw() {
|
||||
"dictionary": [],
|
||||
"synonyms": {},
|
||||
"distinctAttribute": null,
|
||||
"proximityPrecision": null,
|
||||
"proximityPrecision": "byWord",
|
||||
"typoTolerance": {
|
||||
"enabled": true,
|
||||
"minWordSizeForTypos": {
|
||||
@@ -1371,8 +1362,7 @@ async fn import_dump_v4_movie_raw() {
|
||||
},
|
||||
"pagination": {
|
||||
"maxTotalHits": 1000
|
||||
},
|
||||
"embedders": {}
|
||||
}
|
||||
}
|
||||
"###
|
||||
);
|
||||
@@ -1499,7 +1489,7 @@ async fn import_dump_v4_movie_with_settings() {
|
||||
"dictionary": [],
|
||||
"synonyms": {},
|
||||
"distinctAttribute": null,
|
||||
"proximityPrecision": null,
|
||||
"proximityPrecision": "byWord",
|
||||
"typoTolerance": {
|
||||
"enabled": true,
|
||||
"minWordSizeForTypos": {
|
||||
@@ -1517,8 +1507,7 @@ async fn import_dump_v4_movie_with_settings() {
|
||||
},
|
||||
"pagination": {
|
||||
"maxTotalHits": 1000
|
||||
},
|
||||
"embedders": {}
|
||||
}
|
||||
}
|
||||
"###
|
||||
);
|
||||
@@ -1644,7 +1633,7 @@ async fn import_dump_v4_rubygems_with_settings() {
|
||||
"dictionary": [],
|
||||
"synonyms": {},
|
||||
"distinctAttribute": null,
|
||||
"proximityPrecision": null,
|
||||
"proximityPrecision": "byWord",
|
||||
"typoTolerance": {
|
||||
"enabled": true,
|
||||
"minWordSizeForTypos": {
|
||||
@@ -1662,8 +1651,7 @@ async fn import_dump_v4_rubygems_with_settings() {
|
||||
},
|
||||
"pagination": {
|
||||
"maxTotalHits": 1000
|
||||
},
|
||||
"embedders": {}
|
||||
}
|
||||
}
|
||||
"###
|
||||
);
|
||||
@@ -1907,8 +1895,7 @@ async fn import_dump_v6_containing_experimental_features() {
|
||||
},
|
||||
"pagination": {
|
||||
"maxTotalHits": 1000
|
||||
},
|
||||
"embedders": {}
|
||||
}
|
||||
}
|
||||
"###);
|
||||
|
||||
|
||||
@@ -105,6 +105,24 @@ async fn more_advanced_facet_search() {
|
||||
snapshot!(response["facetHits"].as_array().unwrap().len(), @"1");
|
||||
}
|
||||
|
||||
#[actix_rt::test]
|
||||
async fn simple_facet_search_with_max_values() {
|
||||
let server = Server::new().await;
|
||||
let index = server.index("test");
|
||||
|
||||
let documents = DOCUMENTS.clone();
|
||||
index.update_settings_faceting(json!({ "maxValuesPerFacet": 1 })).await;
|
||||
index.update_settings_filterable_attributes(json!(["genres"])).await;
|
||||
index.add_documents(documents, None).await;
|
||||
index.wait_task(2).await;
|
||||
|
||||
let (response, code) =
|
||||
index.facet_search(json!({"facetName": "genres", "facetQuery": "a"})).await;
|
||||
|
||||
assert_eq!(code, 200, "{}", response);
|
||||
assert_eq!(dbg!(response)["facetHits"].as_array().unwrap().len(), 1);
|
||||
}
|
||||
|
||||
#[actix_rt::test]
|
||||
async fn non_filterable_facet_search_error() {
|
||||
let server = Server::new().await;
|
||||
|
||||
@@ -21,9 +21,9 @@ async fn index_with_documents<'a>(server: &'a Server, documents: &Value) -> Inde
|
||||
"###);
|
||||
|
||||
let (response, code) = index
|
||||
.update_settings(
|
||||
json!({ "embedders": {"default": {"source": {"userProvided": {"dimensions": 2}}}} }),
|
||||
)
|
||||
.update_settings(json!({ "embedders": {"default": {
|
||||
"source": "userProvided",
|
||||
"dimensions": 2}}} ))
|
||||
.await;
|
||||
assert_eq!(202, code, "{:?}", response);
|
||||
index.wait_task(response.uid()).await;
|
||||
@@ -56,6 +56,15 @@ static SIMPLE_SEARCH_DOCUMENTS: Lazy<Value> = Lazy::new(|| {
|
||||
}])
|
||||
});
|
||||
|
||||
static SINGLE_DOCUMENT: Lazy<Value> = Lazy::new(|| {
|
||||
json!([{
|
||||
"title": "Shazam!",
|
||||
"desc": "a Captain Marvel ersatz",
|
||||
"id": "1",
|
||||
"_vectors": {"default": [1.0, 3.0]},
|
||||
}])
|
||||
});
|
||||
|
||||
#[actix_rt::test]
|
||||
async fn simple_search() {
|
||||
let server = Server::new().await;
|
||||
@@ -149,3 +158,18 @@ async fn invalid_semantic_ratio() {
|
||||
}
|
||||
"###);
|
||||
}
|
||||
|
||||
#[actix_rt::test]
|
||||
async fn single_document() {
|
||||
let server = Server::new().await;
|
||||
let index = index_with_documents(&server, &SINGLE_DOCUMENT).await;
|
||||
|
||||
let (response, code) = index
|
||||
.search_post(
|
||||
json!({"vector": [1.0, 3.0], "hybrid": {"semanticRatio": 1.0}, "showRankingScore": true}),
|
||||
)
|
||||
.await;
|
||||
|
||||
snapshot!(code, @"200 OK");
|
||||
snapshot!(response["hits"][0], @r###"{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":1.0,"_semanticScore":1.0}"###);
|
||||
}
|
||||
|
||||
@@ -890,13 +890,21 @@ async fn experimental_feature_vector_store() {
|
||||
let (response, code) = index
|
||||
.update_settings(json!({"embedders": {
|
||||
"manual": {
|
||||
"source": {
|
||||
"userProvided": {"dimensions": 3}
|
||||
}
|
||||
"source": "userProvided",
|
||||
"dimensions": 3,
|
||||
}
|
||||
}}))
|
||||
.await;
|
||||
|
||||
meili_snap::snapshot!(response, @r###"
|
||||
{
|
||||
"taskUid": 1,
|
||||
"indexUid": "test",
|
||||
"status": "enqueued",
|
||||
"type": "settingsUpdate",
|
||||
"enqueuedAt": "[date]"
|
||||
}
|
||||
"###);
|
||||
meili_snap::snapshot!(code, @"202 Accepted");
|
||||
let response = index.wait_task(response.uid()).await;
|
||||
|
||||
|
||||
@@ -54,7 +54,7 @@ async fn get_settings() {
|
||||
let (response, code) = index.settings().await;
|
||||
assert_eq!(code, 200);
|
||||
let settings = response.as_object().unwrap();
|
||||
assert_eq!(settings.keys().len(), 16);
|
||||
assert_eq!(settings.keys().len(), 15);
|
||||
assert_eq!(settings["displayedAttributes"], json!(["*"]));
|
||||
assert_eq!(settings["searchableAttributes"], json!(["*"]));
|
||||
assert_eq!(settings["filterableAttributes"], json!([]));
|
||||
@@ -83,7 +83,7 @@ async fn get_settings() {
|
||||
"maxTotalHits": 1000,
|
||||
})
|
||||
);
|
||||
assert_eq!(settings["embedders"], json!({}));
|
||||
assert_eq!(settings["proximityPrecision"], json!("byWord"));
|
||||
}
|
||||
|
||||
#[actix_rt::test]
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
mod errors;
|
||||
mod webhook;
|
||||
|
||||
use meili_snap::insta::assert_json_snapshot;
|
||||
use time::format_description::well_known::Rfc3339;
|
||||
|
||||
123
meilisearch/tests/tasks/webhook.rs
Normal file
123
meilisearch/tests/tasks/webhook.rs
Normal file
@@ -0,0 +1,123 @@
|
||||
//! To test the webhook, we need to spawn a new server with a URL listening for
|
||||
//! post requests. The webhook handle starts a server and forwards all the
|
||||
//! received requests into a channel for you to handle.
|
||||
|
||||
use std::sync::Arc;
|
||||
|
||||
use actix_http::body::MessageBody;
|
||||
use actix_web::dev::{ServiceFactory, ServiceResponse};
|
||||
use actix_web::web::{Bytes, Data};
|
||||
use actix_web::{post, App, HttpResponse, HttpServer};
|
||||
use meili_snap::{json_string, snapshot};
|
||||
use meilisearch::Opt;
|
||||
use tokio::sync::mpsc;
|
||||
use url::Url;
|
||||
|
||||
use crate::common::{default_settings, Server};
|
||||
use crate::json;
|
||||
|
||||
#[post("/")]
|
||||
async fn forward_body(sender: Data<mpsc::UnboundedSender<Vec<u8>>>, body: Bytes) -> HttpResponse {
|
||||
let body = body.to_vec();
|
||||
sender.send(body).unwrap();
|
||||
HttpResponse::Ok().into()
|
||||
}
|
||||
|
||||
fn create_app(
|
||||
sender: Arc<mpsc::UnboundedSender<Vec<u8>>>,
|
||||
) -> actix_web::App<
|
||||
impl ServiceFactory<
|
||||
actix_web::dev::ServiceRequest,
|
||||
Config = (),
|
||||
Response = ServiceResponse<impl MessageBody>,
|
||||
Error = actix_web::Error,
|
||||
InitError = (),
|
||||
>,
|
||||
> {
|
||||
App::new().service(forward_body).app_data(Data::from(sender))
|
||||
}
|
||||
|
||||
struct WebhookHandle {
|
||||
pub server_handle: tokio::task::JoinHandle<Result<(), std::io::Error>>,
|
||||
pub url: String,
|
||||
pub receiver: mpsc::UnboundedReceiver<Vec<u8>>,
|
||||
}
|
||||
|
||||
async fn create_webhook_server() -> WebhookHandle {
|
||||
let mut log_builder = env_logger::Builder::new();
|
||||
log_builder.parse_filters("info");
|
||||
log_builder.init();
|
||||
|
||||
let (sender, receiver) = mpsc::unbounded_channel();
|
||||
let sender = Arc::new(sender);
|
||||
|
||||
// By listening on the port 0, the system will give us any available port.
|
||||
let server =
|
||||
HttpServer::new(move || create_app(sender.clone())).bind(("127.0.0.1", 0)).unwrap();
|
||||
let (ip, scheme) = server.addrs_with_scheme()[0];
|
||||
let url = format!("{scheme}://{ip}/");
|
||||
|
||||
let server_handle = tokio::spawn(server.run());
|
||||
WebhookHandle { server_handle, url, receiver }
|
||||
}
|
||||
|
||||
#[actix_web::test]
|
||||
async fn test_basic_webhook() {
|
||||
let WebhookHandle { server_handle, url, mut receiver } = create_webhook_server().await;
|
||||
|
||||
let db_path = tempfile::tempdir().unwrap();
|
||||
let server = Server::new_with_options(Opt {
|
||||
task_webhook_url: Some(Url::parse(&url).unwrap()),
|
||||
..default_settings(db_path.path())
|
||||
})
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let index = server.index("tamo");
|
||||
// May be flaky: we're relying on the fact that while the first document addition is processed, the other
|
||||
// operations will be received and will be batched together. If it doesn't happen it's not a problem
|
||||
// the rest of the test won't assume anything about the number of tasks per batch.
|
||||
for i in 0..5 {
|
||||
let (_, _status) = index.add_documents(json!({ "id": i, "doggo": "bone" }), None).await;
|
||||
}
|
||||
|
||||
let mut nb_tasks = 0;
|
||||
while let Some(payload) = receiver.recv().await {
|
||||
let payload = String::from_utf8(payload).unwrap();
|
||||
let jsonl = payload.split('\n');
|
||||
for json in jsonl {
|
||||
if json.is_empty() {
|
||||
break; // we reached EOF
|
||||
}
|
||||
nb_tasks += 1;
|
||||
let json: serde_json::Value = serde_json::from_str(json).unwrap();
|
||||
snapshot!(
|
||||
json_string!(json, { ".uid" => "[uid]", ".duration" => "[duration]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]" }),
|
||||
@r###"
|
||||
{
|
||||
"uid": "[uid]",
|
||||
"indexUid": "tamo",
|
||||
"status": "succeeded",
|
||||
"type": "documentAdditionOrUpdate",
|
||||
"canceledBy": null,
|
||||
"details": {
|
||||
"receivedDocuments": 1,
|
||||
"indexedDocuments": 1
|
||||
},
|
||||
"error": null,
|
||||
"duration": "[duration]",
|
||||
"enqueuedAt": "[date]",
|
||||
"startedAt": "[date]",
|
||||
"finishedAt": "[date]"
|
||||
}
|
||||
"###);
|
||||
}
|
||||
if nb_tasks == 5 {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
assert!(nb_tasks == 5, "We should have received the 5 tasks but only received {nb_tasks}");
|
||||
|
||||
server_handle.abort();
|
||||
}
|
||||
@@ -192,7 +192,7 @@ only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and undersco
|
||||
MissingDocumentField(#[from] crate::prompt::error::RenderPromptError),
|
||||
#[error(transparent)]
|
||||
InvalidPrompt(#[from] crate::prompt::error::NewPromptError),
|
||||
#[error("Invalid prompt in for embeddings with name '{0}': {1}.")]
|
||||
#[error("`.embedders.{0}.documentTemplate`: Invalid template: {1}.")]
|
||||
InvalidPromptForEmbeddings(String, crate::prompt::error::NewPromptError),
|
||||
#[error("Too many embedders in the configuration. Found {0}, but limited to 256.")]
|
||||
TooManyEmbedders(usize),
|
||||
@@ -200,6 +200,33 @@ only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and undersco
|
||||
InvalidEmbedder(String),
|
||||
#[error("Too many vectors for document with id {0}: found {1}, but limited to 256.")]
|
||||
TooManyVectors(String, usize),
|
||||
#[error("`.embedders.{embedder_name}`: Field `{field}` unavailable for source `{source_}` (only available for sources: {}). Available fields: {}",
|
||||
allowed_sources_for_field
|
||||
.iter()
|
||||
.map(|accepted| format!("`{}`", accepted))
|
||||
.collect::<Vec<String>>()
|
||||
.join(", "),
|
||||
allowed_fields_for_source
|
||||
.iter()
|
||||
.map(|accepted| format!("`{}`", accepted))
|
||||
.collect::<Vec<String>>()
|
||||
.join(", ")
|
||||
)]
|
||||
InvalidFieldForSource {
|
||||
embedder_name: String,
|
||||
source_: crate::vector::settings::EmbedderSource,
|
||||
field: &'static str,
|
||||
allowed_fields_for_source: &'static [&'static str],
|
||||
allowed_sources_for_field: &'static [crate::vector::settings::EmbedderSource],
|
||||
},
|
||||
#[error("`.embedders.{embedder_name}.model`: Invalid model `{model}` for OpenAI. Supported models: {:?}", crate::vector::openai::EmbeddingModel::supported_models())]
|
||||
InvalidOpenAiModel { embedder_name: String, model: String },
|
||||
#[error("`.embedders.{embedder_name}`: Missing field `{field}` (note: this field is mandatory for source {source_})")]
|
||||
MissingFieldForSource {
|
||||
field: &'static str,
|
||||
source_: crate::vector::settings::EmbedderSource,
|
||||
embedder_name: String,
|
||||
},
|
||||
}
|
||||
|
||||
impl From<crate::vector::Error> for Error {
|
||||
|
||||
@@ -27,8 +27,8 @@ static LEVDIST0: Lazy<LevBuilder> = Lazy::new(|| LevBuilder::new(0, true));
|
||||
static LEVDIST1: Lazy<LevBuilder> = Lazy::new(|| LevBuilder::new(1, true));
|
||||
static LEVDIST2: Lazy<LevBuilder> = Lazy::new(|| LevBuilder::new(2, true));
|
||||
|
||||
/// The maximum number of facets returned by the facet search route.
|
||||
const MAX_NUMBER_OF_FACETS: usize = 100;
|
||||
/// The maximum number of values per facet returned by the facet search route.
|
||||
const DEFAULT_MAX_NUMBER_OF_VALUES_PER_FACET: usize = 100;
|
||||
|
||||
pub mod facet;
|
||||
mod fst_utils;
|
||||
@@ -306,6 +306,7 @@ pub struct SearchForFacetValues<'a> {
|
||||
query: Option<String>,
|
||||
facet: String,
|
||||
search_query: Search<'a>,
|
||||
max_values: usize,
|
||||
is_hybrid: bool,
|
||||
}
|
||||
|
||||
@@ -315,7 +316,13 @@ impl<'a> SearchForFacetValues<'a> {
|
||||
search_query: Search<'a>,
|
||||
is_hybrid: bool,
|
||||
) -> SearchForFacetValues<'a> {
|
||||
SearchForFacetValues { query: None, facet, search_query, is_hybrid }
|
||||
SearchForFacetValues {
|
||||
query: None,
|
||||
facet,
|
||||
search_query,
|
||||
max_values: DEFAULT_MAX_NUMBER_OF_VALUES_PER_FACET,
|
||||
is_hybrid,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn query(&mut self, query: impl Into<String>) -> &mut Self {
|
||||
@@ -323,6 +330,11 @@ impl<'a> SearchForFacetValues<'a> {
|
||||
self
|
||||
}
|
||||
|
||||
pub fn max_values(&mut self, max: usize) -> &mut Self {
|
||||
self.max_values = max;
|
||||
self
|
||||
}
|
||||
|
||||
fn one_original_value_of(
|
||||
&self,
|
||||
field_id: FieldId,
|
||||
@@ -462,7 +474,7 @@ impl<'a> SearchForFacetValues<'a> {
|
||||
.unwrap_or_else(|| left_bound.to_string());
|
||||
results.push(FacetValueHit { value, count });
|
||||
}
|
||||
if results.len() >= MAX_NUMBER_OF_FACETS {
|
||||
if results.len() >= self.max_values {
|
||||
break;
|
||||
}
|
||||
}
|
||||
@@ -507,7 +519,7 @@ impl<'a> SearchForFacetValues<'a> {
|
||||
.unwrap_or_else(|| query.to_string());
|
||||
results.push(FacetValueHit { value, count });
|
||||
}
|
||||
if results.len() >= MAX_NUMBER_OF_FACETS {
|
||||
if results.len() >= self.max_values {
|
||||
return Ok(ControlFlow::Break(()));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -15,6 +15,7 @@ pub struct BucketSortOutput {
|
||||
|
||||
// TODO: would probably be good to regroup some of these inside of a struct?
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
#[logging_timer::time]
|
||||
pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>(
|
||||
ctx: &mut SearchContext<'ctx>,
|
||||
mut ranking_rules: Vec<BoxRankingRule<'ctx, Q>>,
|
||||
|
||||
@@ -72,7 +72,7 @@ impl<'m> MatcherBuilder<'m> {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Copy, Clone, Default)]
|
||||
#[derive(Copy, Clone, Default, Debug)]
|
||||
pub struct FormatOptions {
|
||||
pub highlight: bool,
|
||||
pub crop: Option<usize>,
|
||||
@@ -82,6 +82,10 @@ impl FormatOptions {
|
||||
pub fn merge(self, other: Self) -> Self {
|
||||
Self { highlight: self.highlight || other.highlight, crop: self.crop.or(other.crop) }
|
||||
}
|
||||
|
||||
pub fn should_format(&self) -> bool {
|
||||
self.highlight || self.crop.is_some()
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
|
||||
@@ -191,6 +191,7 @@ fn resolve_maximally_reduced_query_graph(
|
||||
Ok(docids)
|
||||
}
|
||||
|
||||
#[logging_timer::time]
|
||||
fn resolve_universe(
|
||||
ctx: &mut SearchContext,
|
||||
initial_universe: &RoaringBitmap,
|
||||
@@ -556,6 +557,7 @@ pub fn execute_vector_search(
|
||||
}
|
||||
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
#[logging_timer::time]
|
||||
pub fn execute_search(
|
||||
ctx: &mut SearchContext,
|
||||
query: Option<&str>,
|
||||
|
||||
@@ -5,6 +5,7 @@ use super::*;
|
||||
use crate::{Result, SearchContext, MAX_WORD_LENGTH};
|
||||
|
||||
/// Convert the tokenised search query into a list of located query terms.
|
||||
#[logging_timer::time]
|
||||
pub fn located_query_terms_from_tokens(
|
||||
ctx: &mut SearchContext,
|
||||
query: NormalizedTokenIter,
|
||||
|
||||
@@ -26,7 +26,7 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
|
||||
obkv_documents: grenad::Reader<R>,
|
||||
indexer: GrenadParameters,
|
||||
searchable_fields: &Option<HashSet<FieldId>>,
|
||||
stop_words: Option<&fst::Set<&[u8]>>,
|
||||
stop_words: Option<&fst::Set<Vec<u8>>>,
|
||||
allowed_separators: Option<&[&str]>,
|
||||
dictionary: Option<&[&str]>,
|
||||
max_positions_per_attributes: Option<u32>,
|
||||
@@ -181,11 +181,11 @@ fn searchable_fields_changed(
|
||||
|
||||
/// Factorize tokenizer building.
|
||||
fn tokenizer_builder<'a>(
|
||||
stop_words: Option<&'a fst::Set<&[u8]>>,
|
||||
stop_words: Option<&'a fst::Set<Vec<u8>>>,
|
||||
allowed_separators: Option<&'a [&str]>,
|
||||
dictionary: Option<&'a [&str]>,
|
||||
script_language: Option<&'a HashMap<Script, Vec<Language>>>,
|
||||
) -> TokenizerBuilder<'a, &'a [u8]> {
|
||||
) -> TokenizerBuilder<'a, Vec<u8>> {
|
||||
let mut tokenizer_builder = TokenizerBuilder::new();
|
||||
if let Some(stop_words) = stop_words {
|
||||
tokenizer_builder.stop_words(stop_words);
|
||||
@@ -211,7 +211,7 @@ fn lang_safe_tokens_from_document<'a>(
|
||||
obkv: &KvReader<FieldId>,
|
||||
searchable_fields: &Option<HashSet<FieldId>>,
|
||||
tokenizer: &Tokenizer,
|
||||
stop_words: Option<&fst::Set<&[u8]>>,
|
||||
stop_words: Option<&fst::Set<Vec<u8>>>,
|
||||
allowed_separators: Option<&[&str]>,
|
||||
dictionary: Option<&[&str]>,
|
||||
max_positions_per_attributes: u32,
|
||||
|
||||
@@ -14,7 +14,6 @@ use std::fs::File;
|
||||
use std::io::BufReader;
|
||||
|
||||
use crossbeam_channel::Sender;
|
||||
use log::debug;
|
||||
use rayon::prelude::*;
|
||||
|
||||
use self::extract_docid_word_positions::extract_docid_word_positions;
|
||||
@@ -29,10 +28,7 @@ use self::extract_vector_points::{
|
||||
use self::extract_word_docids::extract_word_docids;
|
||||
use self::extract_word_pair_proximity_docids::extract_word_pair_proximity_docids;
|
||||
use self::extract_word_position_docids::extract_word_position_docids;
|
||||
use super::helpers::{
|
||||
as_cloneable_grenad, merge_deladd_cbo_roaring_bitmaps, CursorClonableMmap, GrenadParameters,
|
||||
MergeFn, MergeableReader,
|
||||
};
|
||||
use super::helpers::{as_cloneable_grenad, CursorClonableMmap, GrenadParameters};
|
||||
use super::{helpers, TypedChunk};
|
||||
use crate::proximity::ProximityPrecision;
|
||||
use crate::vector::EmbeddingConfigs;
|
||||
@@ -51,7 +47,7 @@ pub(crate) fn data_from_obkv_documents(
|
||||
primary_key_id: FieldId,
|
||||
geo_fields_ids: Option<(FieldId, FieldId)>,
|
||||
field_id_map: FieldsIdsMap,
|
||||
stop_words: Option<fst::Set<&[u8]>>,
|
||||
stop_words: Option<fst::Set<Vec<u8>>>,
|
||||
allowed_separators: Option<&[&str]>,
|
||||
dictionary: Option<&[&str]>,
|
||||
max_positions_per_attributes: Option<u32>,
|
||||
@@ -61,218 +57,170 @@ pub(crate) fn data_from_obkv_documents(
|
||||
) -> Result<()> {
|
||||
puffin::profile_function!();
|
||||
|
||||
original_obkv_chunks
|
||||
.par_bridge()
|
||||
.map(|original_documents_chunk| {
|
||||
send_original_documents_data(
|
||||
original_documents_chunk,
|
||||
indexer,
|
||||
lmdb_writer_sx.clone(),
|
||||
field_id_map.clone(),
|
||||
embedders.clone(),
|
||||
)
|
||||
})
|
||||
.collect::<Result<()>>()?;
|
||||
|
||||
#[allow(clippy::type_complexity)]
|
||||
let result: Result<(Vec<_>, (Vec<_>, (Vec<_>, (Vec<_>, (Vec<_>, Vec<_>)))))> =
|
||||
flattened_obkv_chunks
|
||||
.par_bridge()
|
||||
.map(|flattened_obkv_chunks| {
|
||||
send_and_extract_flattened_documents_data(
|
||||
flattened_obkv_chunks,
|
||||
indexer,
|
||||
lmdb_writer_sx.clone(),
|
||||
&searchable_fields,
|
||||
&faceted_fields,
|
||||
primary_key_id,
|
||||
geo_fields_ids,
|
||||
&stop_words,
|
||||
&allowed_separators,
|
||||
&dictionary,
|
||||
max_positions_per_attributes,
|
||||
)
|
||||
})
|
||||
.collect();
|
||||
|
||||
let (
|
||||
docid_word_positions_chunks,
|
||||
(
|
||||
fid_docid_facet_numbers_chunks,
|
||||
(
|
||||
fid_docid_facet_strings_chunks,
|
||||
(
|
||||
facet_is_null_docids_chunks,
|
||||
(facet_is_empty_docids_chunks, facet_exists_docids_chunks),
|
||||
),
|
||||
),
|
||||
),
|
||||
) = result?;
|
||||
|
||||
// merge facet_exists_docids and send them as a typed chunk
|
||||
{
|
||||
let lmdb_writer_sx = lmdb_writer_sx.clone();
|
||||
rayon::spawn(move || {
|
||||
debug!("merge {} database", "facet-id-exists-docids");
|
||||
match facet_exists_docids_chunks.merge(merge_deladd_cbo_roaring_bitmaps, &indexer) {
|
||||
Ok(reader) => {
|
||||
let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdFacetExistsDocids(reader)));
|
||||
}
|
||||
Err(e) => {
|
||||
let _ = lmdb_writer_sx.send(Err(e));
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
// merge facet_is_null_docids and send them as a typed chunk
|
||||
{
|
||||
let lmdb_writer_sx = lmdb_writer_sx.clone();
|
||||
rayon::spawn(move || {
|
||||
debug!("merge {} database", "facet-id-is-null-docids");
|
||||
match facet_is_null_docids_chunks.merge(merge_deladd_cbo_roaring_bitmaps, &indexer) {
|
||||
Ok(reader) => {
|
||||
let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdFacetIsNullDocids(reader)));
|
||||
}
|
||||
Err(e) => {
|
||||
let _ = lmdb_writer_sx.send(Err(e));
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
// merge facet_is_empty_docids and send them as a typed chunk
|
||||
{
|
||||
let lmdb_writer_sx = lmdb_writer_sx.clone();
|
||||
rayon::spawn(move || {
|
||||
debug!("merge {} database", "facet-id-is-empty-docids");
|
||||
match facet_is_empty_docids_chunks.merge(merge_deladd_cbo_roaring_bitmaps, &indexer) {
|
||||
Ok(reader) => {
|
||||
let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdFacetIsEmptyDocids(reader)));
|
||||
}
|
||||
Err(e) => {
|
||||
let _ = lmdb_writer_sx.send(Err(e));
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
if proximity_precision == ProximityPrecision::ByWord {
|
||||
spawn_extraction_task::<_, _, Vec<grenad::Reader<BufReader<File>>>>(
|
||||
docid_word_positions_chunks.clone(),
|
||||
indexer,
|
||||
lmdb_writer_sx.clone(),
|
||||
extract_word_pair_proximity_docids,
|
||||
merge_deladd_cbo_roaring_bitmaps,
|
||||
TypedChunk::WordPairProximityDocids,
|
||||
"word-pair-proximity-docids",
|
||||
);
|
||||
}
|
||||
|
||||
spawn_extraction_task::<_, _, Vec<grenad::Reader<BufReader<File>>>>(
|
||||
docid_word_positions_chunks.clone(),
|
||||
indexer,
|
||||
lmdb_writer_sx.clone(),
|
||||
extract_fid_word_count_docids,
|
||||
merge_deladd_cbo_roaring_bitmaps,
|
||||
TypedChunk::FieldIdWordCountDocids,
|
||||
"field-id-wordcount-docids",
|
||||
);
|
||||
|
||||
spawn_extraction_task::<
|
||||
_,
|
||||
_,
|
||||
Vec<(
|
||||
grenad::Reader<BufReader<File>>,
|
||||
grenad::Reader<BufReader<File>>,
|
||||
grenad::Reader<BufReader<File>>,
|
||||
)>,
|
||||
>(
|
||||
docid_word_positions_chunks.clone(),
|
||||
indexer,
|
||||
lmdb_writer_sx.clone(),
|
||||
move |doc_word_pos, indexer| extract_word_docids(doc_word_pos, indexer, &exact_attributes),
|
||||
merge_deladd_cbo_roaring_bitmaps,
|
||||
|(word_docids_reader, exact_word_docids_reader, word_fid_docids_reader)| {
|
||||
TypedChunk::WordDocids {
|
||||
word_docids_reader,
|
||||
exact_word_docids_reader,
|
||||
word_fid_docids_reader,
|
||||
}
|
||||
let (original_pipeline_result, flattened_pipeline_result): (Result<_>, Result<_>) = rayon::join(
|
||||
|| {
|
||||
original_obkv_chunks
|
||||
.par_bridge()
|
||||
.map(|original_documents_chunk| {
|
||||
send_original_documents_data(
|
||||
original_documents_chunk,
|
||||
indexer,
|
||||
lmdb_writer_sx.clone(),
|
||||
field_id_map.clone(),
|
||||
embedders.clone(),
|
||||
)
|
||||
})
|
||||
.collect::<Result<()>>()
|
||||
},
|
||||
|| {
|
||||
flattened_obkv_chunks
|
||||
.par_bridge()
|
||||
.map(|flattened_obkv_chunks| {
|
||||
send_and_extract_flattened_documents_data(
|
||||
flattened_obkv_chunks,
|
||||
indexer,
|
||||
lmdb_writer_sx.clone(),
|
||||
&searchable_fields,
|
||||
&faceted_fields,
|
||||
primary_key_id,
|
||||
geo_fields_ids,
|
||||
&stop_words,
|
||||
&allowed_separators,
|
||||
&dictionary,
|
||||
max_positions_per_attributes,
|
||||
)
|
||||
})
|
||||
.inspect(|result| {
|
||||
if proximity_precision == ProximityPrecision::ByWord {
|
||||
if let Ok((docid_word_positions_chunk, _)) = result {
|
||||
run_extraction_task::<_, _, grenad::Reader<BufReader<File>>>(
|
||||
docid_word_positions_chunk.clone(),
|
||||
indexer,
|
||||
lmdb_writer_sx.clone(),
|
||||
extract_word_pair_proximity_docids,
|
||||
TypedChunk::WordPairProximityDocids,
|
||||
"word-pair-proximity-docids",
|
||||
);
|
||||
}
|
||||
}
|
||||
})
|
||||
.inspect(|result| {
|
||||
if let Ok((docid_word_positions_chunk, _)) = result {
|
||||
run_extraction_task::<_, _, grenad::Reader<BufReader<File>>>(
|
||||
docid_word_positions_chunk.clone(),
|
||||
indexer,
|
||||
lmdb_writer_sx.clone(),
|
||||
extract_fid_word_count_docids,
|
||||
TypedChunk::FieldIdWordCountDocids,
|
||||
"field-id-wordcount-docids",
|
||||
);
|
||||
}
|
||||
})
|
||||
.inspect(|result| {
|
||||
if let Ok((docid_word_positions_chunk, _)) = result {
|
||||
let exact_attributes = exact_attributes.clone();
|
||||
run_extraction_task::<
|
||||
_,
|
||||
_,
|
||||
(
|
||||
grenad::Reader<BufReader<File>>,
|
||||
grenad::Reader<BufReader<File>>,
|
||||
grenad::Reader<BufReader<File>>,
|
||||
),
|
||||
>(
|
||||
docid_word_positions_chunk.clone(),
|
||||
indexer,
|
||||
lmdb_writer_sx.clone(),
|
||||
move |doc_word_pos, indexer| {
|
||||
extract_word_docids(doc_word_pos, indexer, &exact_attributes)
|
||||
},
|
||||
|(
|
||||
word_docids_reader,
|
||||
exact_word_docids_reader,
|
||||
word_fid_docids_reader,
|
||||
)| {
|
||||
TypedChunk::WordDocids {
|
||||
word_docids_reader,
|
||||
exact_word_docids_reader,
|
||||
word_fid_docids_reader,
|
||||
}
|
||||
},
|
||||
"word-docids",
|
||||
);
|
||||
}
|
||||
})
|
||||
.inspect(|result| {
|
||||
if let Ok((docid_word_positions_chunk, _)) = result {
|
||||
run_extraction_task::<_, _, grenad::Reader<BufReader<File>>>(
|
||||
docid_word_positions_chunk.clone(),
|
||||
indexer,
|
||||
lmdb_writer_sx.clone(),
|
||||
extract_word_position_docids,
|
||||
TypedChunk::WordPositionDocids,
|
||||
"word-position-docids",
|
||||
);
|
||||
}
|
||||
})
|
||||
.inspect(|result| {
|
||||
if let Ok((_, (_, fid_docid_facet_strings_chunk))) = result {
|
||||
run_extraction_task::<_, _, grenad::Reader<BufReader<File>>>(
|
||||
fid_docid_facet_strings_chunk.clone(),
|
||||
indexer,
|
||||
lmdb_writer_sx.clone(),
|
||||
extract_facet_string_docids,
|
||||
TypedChunk::FieldIdFacetStringDocids,
|
||||
"field-id-facet-string-docids",
|
||||
);
|
||||
}
|
||||
})
|
||||
.inspect(|result| {
|
||||
if let Ok((_, (fid_docid_facet_numbers_chunk, _))) = result {
|
||||
run_extraction_task::<_, _, grenad::Reader<BufReader<File>>>(
|
||||
fid_docid_facet_numbers_chunk.clone(),
|
||||
indexer,
|
||||
lmdb_writer_sx.clone(),
|
||||
extract_facet_number_docids,
|
||||
TypedChunk::FieldIdFacetNumberDocids,
|
||||
"field-id-facet-number-docids",
|
||||
);
|
||||
}
|
||||
})
|
||||
.map(|r| r.map(|_| ()))
|
||||
.collect::<Result<()>>()
|
||||
},
|
||||
"word-docids",
|
||||
);
|
||||
|
||||
spawn_extraction_task::<_, _, Vec<grenad::Reader<BufReader<File>>>>(
|
||||
docid_word_positions_chunks.clone(),
|
||||
indexer,
|
||||
lmdb_writer_sx.clone(),
|
||||
extract_word_position_docids,
|
||||
merge_deladd_cbo_roaring_bitmaps,
|
||||
TypedChunk::WordPositionDocids,
|
||||
"word-position-docids",
|
||||
);
|
||||
|
||||
spawn_extraction_task::<_, _, Vec<grenad::Reader<BufReader<File>>>>(
|
||||
fid_docid_facet_strings_chunks,
|
||||
indexer,
|
||||
lmdb_writer_sx.clone(),
|
||||
extract_facet_string_docids,
|
||||
merge_deladd_cbo_roaring_bitmaps,
|
||||
TypedChunk::FieldIdFacetStringDocids,
|
||||
"field-id-facet-string-docids",
|
||||
);
|
||||
|
||||
spawn_extraction_task::<_, _, Vec<grenad::Reader<BufReader<File>>>>(
|
||||
fid_docid_facet_numbers_chunks,
|
||||
indexer,
|
||||
lmdb_writer_sx,
|
||||
extract_facet_number_docids,
|
||||
merge_deladd_cbo_roaring_bitmaps,
|
||||
TypedChunk::FieldIdFacetNumberDocids,
|
||||
"field-id-facet-number-docids",
|
||||
);
|
||||
|
||||
Ok(())
|
||||
original_pipeline_result.and(flattened_pipeline_result)
|
||||
}
|
||||
|
||||
/// Spawn a new task to extract data for a specific DB using extract_fn.
|
||||
/// Generated grenad chunks are merged using the merge_fn.
|
||||
/// The result of merged chunks is serialized as TypedChunk using the serialize_fn
|
||||
/// and sent into lmdb_writer_sx.
|
||||
fn spawn_extraction_task<FE, FS, M>(
|
||||
chunks: Vec<grenad::Reader<CursorClonableMmap>>,
|
||||
fn run_extraction_task<FE, FS, M>(
|
||||
chunk: grenad::Reader<CursorClonableMmap>,
|
||||
indexer: GrenadParameters,
|
||||
lmdb_writer_sx: Sender<Result<TypedChunk>>,
|
||||
extract_fn: FE,
|
||||
merge_fn: MergeFn,
|
||||
serialize_fn: FS,
|
||||
name: &'static str,
|
||||
) where
|
||||
FE: Fn(grenad::Reader<CursorClonableMmap>, GrenadParameters) -> Result<M::Output>
|
||||
FE: Fn(grenad::Reader<CursorClonableMmap>, GrenadParameters) -> Result<M>
|
||||
+ Sync
|
||||
+ Send
|
||||
+ 'static,
|
||||
FS: Fn(M::Output) -> TypedChunk + Sync + Send + 'static,
|
||||
M: MergeableReader + FromParallelIterator<M::Output> + Send + 'static,
|
||||
M::Output: Send,
|
||||
FS: Fn(M) -> TypedChunk + Sync + Send + 'static,
|
||||
M: Send,
|
||||
{
|
||||
rayon::spawn(move || {
|
||||
puffin::profile_scope!("extract_multiple_chunks", name);
|
||||
let chunks: Result<M> =
|
||||
chunks.into_par_iter().map(|chunk| extract_fn(chunk, indexer)).collect();
|
||||
rayon::spawn(move || match chunks {
|
||||
Ok(chunks) => {
|
||||
debug!("merge {} database", name);
|
||||
puffin::profile_scope!("merge_multiple_chunks", name);
|
||||
let reader = chunks.merge(merge_fn, &indexer);
|
||||
let _ = lmdb_writer_sx.send(reader.map(serialize_fn));
|
||||
}
|
||||
Err(e) => {
|
||||
let _ = lmdb_writer_sx.send(Err(e));
|
||||
}
|
||||
})
|
||||
});
|
||||
puffin::profile_scope!("extract_chunk", name);
|
||||
match extract_fn(chunk, indexer) {
|
||||
Ok(chunk) => {
|
||||
let _ = lmdb_writer_sx.send(Ok(serialize_fn(chunk)));
|
||||
}
|
||||
Err(e) => {
|
||||
let _ = lmdb_writer_sx.send(Err(e));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Extract chunked data and send it into lmdb_writer_sx sender:
|
||||
@@ -350,22 +298,13 @@ fn send_and_extract_flattened_documents_data(
|
||||
faceted_fields: &HashSet<FieldId>,
|
||||
primary_key_id: FieldId,
|
||||
geo_fields_ids: Option<(FieldId, FieldId)>,
|
||||
stop_words: &Option<fst::Set<&[u8]>>,
|
||||
stop_words: &Option<fst::Set<Vec<u8>>>,
|
||||
allowed_separators: &Option<&[&str]>,
|
||||
dictionary: &Option<&[&str]>,
|
||||
max_positions_per_attributes: Option<u32>,
|
||||
) -> Result<(
|
||||
grenad::Reader<CursorClonableMmap>,
|
||||
(
|
||||
grenad::Reader<CursorClonableMmap>,
|
||||
(
|
||||
grenad::Reader<CursorClonableMmap>,
|
||||
(
|
||||
grenad::Reader<BufReader<File>>,
|
||||
(grenad::Reader<BufReader<File>>, grenad::Reader<BufReader<File>>),
|
||||
),
|
||||
),
|
||||
),
|
||||
(grenad::Reader<CursorClonableMmap>, grenad::Reader<CursorClonableMmap>),
|
||||
)> {
|
||||
let flattened_documents_chunk =
|
||||
flattened_documents_chunk.and_then(|c| unsafe { as_cloneable_grenad(&c) })?;
|
||||
@@ -436,16 +375,17 @@ fn send_and_extract_flattened_documents_data(
|
||||
fid_docid_facet_strings_chunk.clone(),
|
||||
)));
|
||||
|
||||
Ok((
|
||||
fid_docid_facet_numbers_chunk,
|
||||
(
|
||||
fid_docid_facet_strings_chunk,
|
||||
(
|
||||
fid_facet_is_null_docids_chunk,
|
||||
(fid_facet_is_empty_docids_chunk, fid_facet_exists_docids_chunk),
|
||||
),
|
||||
),
|
||||
))
|
||||
let _ = lmdb_writer_sx
|
||||
.send(Ok(TypedChunk::FieldIdFacetIsNullDocids(fid_facet_is_null_docids_chunk)));
|
||||
|
||||
let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdFacetIsEmptyDocids(
|
||||
fid_facet_is_empty_docids_chunk,
|
||||
)));
|
||||
|
||||
let _ = lmdb_writer_sx
|
||||
.send(Ok(TypedChunk::FieldIdFacetExistsDocids(fid_facet_exists_docids_chunk)));
|
||||
|
||||
Ok((fid_docid_facet_numbers_chunk, fid_docid_facet_strings_chunk))
|
||||
},
|
||||
);
|
||||
|
||||
|
||||
@@ -82,90 +82,6 @@ pub unsafe fn as_cloneable_grenad(
|
||||
Ok(reader)
|
||||
}
|
||||
|
||||
pub trait MergeableReader
|
||||
where
|
||||
Self: Sized,
|
||||
{
|
||||
type Output;
|
||||
|
||||
fn merge(self, merge_fn: MergeFn, indexer: &GrenadParameters) -> Result<Self::Output>;
|
||||
}
|
||||
|
||||
impl MergeableReader for Vec<grenad::Reader<BufReader<File>>> {
|
||||
type Output = grenad::Reader<BufReader<File>>;
|
||||
|
||||
fn merge(self, merge_fn: MergeFn, params: &GrenadParameters) -> Result<Self::Output> {
|
||||
let mut merger = MergerBuilder::new(merge_fn);
|
||||
self.into_iter().try_for_each(|r| merger.push(r))?;
|
||||
merger.finish(params)
|
||||
}
|
||||
}
|
||||
|
||||
impl MergeableReader for Vec<(grenad::Reader<BufReader<File>>, grenad::Reader<BufReader<File>>)> {
|
||||
type Output = (grenad::Reader<BufReader<File>>, grenad::Reader<BufReader<File>>);
|
||||
|
||||
fn merge(self, merge_fn: MergeFn, params: &GrenadParameters) -> Result<Self::Output> {
|
||||
let mut m1 = MergerBuilder::new(merge_fn);
|
||||
let mut m2 = MergerBuilder::new(merge_fn);
|
||||
for (r1, r2) in self.into_iter() {
|
||||
m1.push(r1)?;
|
||||
m2.push(r2)?;
|
||||
}
|
||||
Ok((m1.finish(params)?, m2.finish(params)?))
|
||||
}
|
||||
}
|
||||
|
||||
impl MergeableReader
|
||||
for Vec<(
|
||||
grenad::Reader<BufReader<File>>,
|
||||
grenad::Reader<BufReader<File>>,
|
||||
grenad::Reader<BufReader<File>>,
|
||||
)>
|
||||
{
|
||||
type Output = (
|
||||
grenad::Reader<BufReader<File>>,
|
||||
grenad::Reader<BufReader<File>>,
|
||||
grenad::Reader<BufReader<File>>,
|
||||
);
|
||||
|
||||
fn merge(self, merge_fn: MergeFn, params: &GrenadParameters) -> Result<Self::Output> {
|
||||
let mut m1 = MergerBuilder::new(merge_fn);
|
||||
let mut m2 = MergerBuilder::new(merge_fn);
|
||||
let mut m3 = MergerBuilder::new(merge_fn);
|
||||
for (r1, r2, r3) in self.into_iter() {
|
||||
m1.push(r1)?;
|
||||
m2.push(r2)?;
|
||||
m3.push(r3)?;
|
||||
}
|
||||
Ok((m1.finish(params)?, m2.finish(params)?, m3.finish(params)?))
|
||||
}
|
||||
}
|
||||
|
||||
struct MergerBuilder<R>(grenad::MergerBuilder<R, MergeFn>);
|
||||
|
||||
impl<R: io::Read + io::Seek> MergerBuilder<R> {
|
||||
fn new(merge_fn: MergeFn) -> Self {
|
||||
Self(grenad::MergerBuilder::new(merge_fn))
|
||||
}
|
||||
|
||||
fn push(&mut self, reader: grenad::Reader<R>) -> Result<()> {
|
||||
self.0.push(reader.into_cursor()?);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn finish(self, params: &GrenadParameters) -> Result<grenad::Reader<BufReader<File>>> {
|
||||
let merger = self.0.build();
|
||||
let mut writer = create_writer(
|
||||
params.chunk_compression_type,
|
||||
params.chunk_compression_level,
|
||||
tempfile::tempfile()?,
|
||||
);
|
||||
merger.write_into_stream_writer(&mut writer)?;
|
||||
|
||||
writer_into_reader(writer)
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub struct GrenadParameters {
|
||||
pub chunk_compression_type: CompressionType,
|
||||
|
||||
@@ -10,13 +10,13 @@ use fst::{IntoStreamer, Streamer};
|
||||
pub use grenad_helpers::{
|
||||
as_cloneable_grenad, create_sorter, create_writer, grenad_obkv_into_chunks,
|
||||
merge_ignore_values, sorter_into_reader, write_sorter_into_database, writer_into_reader,
|
||||
GrenadParameters, MergeableReader,
|
||||
GrenadParameters,
|
||||
};
|
||||
pub use merge_functions::{
|
||||
keep_first, keep_latest_obkv, merge_btreeset_string, merge_cbo_roaring_bitmaps,
|
||||
merge_deladd_cbo_roaring_bitmaps, merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
|
||||
merge_roaring_bitmaps, obkvs_keep_last_addition_merge_deletions,
|
||||
obkvs_merge_additions_and_deletions, serialize_roaring_bitmap, MergeFn,
|
||||
obkvs_merge_additions_and_deletions, MergeFn,
|
||||
};
|
||||
|
||||
use crate::MAX_WORD_LENGTH;
|
||||
|
||||
@@ -5,12 +5,13 @@ mod transform;
|
||||
mod typed_chunk;
|
||||
|
||||
use std::collections::{HashMap, HashSet};
|
||||
use std::io::{Cursor, Read, Seek};
|
||||
use std::io::{Read, Seek};
|
||||
use std::iter::FromIterator;
|
||||
use std::num::NonZeroU32;
|
||||
use std::result::Result as StdResult;
|
||||
|
||||
use crossbeam_channel::{Receiver, Sender};
|
||||
use grenad::{Merger, MergerBuilder};
|
||||
use heed::types::Str;
|
||||
use heed::Database;
|
||||
use log::debug;
|
||||
@@ -21,7 +22,7 @@ use slice_group_by::GroupBy;
|
||||
use typed_chunk::{write_typed_chunk_into_index, TypedChunk};
|
||||
|
||||
use self::enrich::enrich_documents_batch;
|
||||
pub use self::enrich::{extract_finite_float_from_value, validate_geo_from_json, DocumentId};
|
||||
pub use self::enrich::{extract_finite_float_from_value, DocumentId};
|
||||
pub use self::helpers::{
|
||||
as_cloneable_grenad, create_sorter, create_writer, fst_stream_into_hashset,
|
||||
fst_stream_into_vec, merge_btreeset_string, merge_cbo_roaring_bitmaps,
|
||||
@@ -313,9 +314,6 @@ where
|
||||
}
|
||||
};
|
||||
|
||||
let original_documents = grenad::Reader::new(original_documents)?;
|
||||
let flattened_documents = grenad::Reader::new(flattened_documents)?;
|
||||
|
||||
// create LMDB writer channel
|
||||
let (lmdb_writer_sx, lmdb_writer_rx): (
|
||||
Sender<Result<TypedChunk>>,
|
||||
@@ -354,11 +352,7 @@ where
|
||||
|
||||
let stop_words = self.index.stop_words(self.wtxn)?;
|
||||
let separators = self.index.allowed_separators(self.wtxn)?;
|
||||
let separators: Option<Vec<_>> =
|
||||
separators.as_ref().map(|x| x.iter().map(String::as_str).collect());
|
||||
let dictionary = self.index.dictionary(self.wtxn)?;
|
||||
let dictionary: Option<Vec<_>> =
|
||||
dictionary.as_ref().map(|x| x.iter().map(String::as_str).collect());
|
||||
let exact_attributes = self.index.exact_attributes_ids(self.wtxn)?;
|
||||
let proximity_precision = self.index.proximity_precision(self.wtxn)?.unwrap_or_default();
|
||||
|
||||
@@ -368,55 +362,77 @@ where
|
||||
max_memory: self.indexer_config.max_memory,
|
||||
max_nb_chunks: self.indexer_config.max_nb_chunks, // default value, may be chosen.
|
||||
};
|
||||
let documents_chunk_size =
|
||||
self.indexer_config.documents_chunk_size.unwrap_or(1024 * 1024 * 4); // 4MiB
|
||||
let documents_chunk_size = match self.indexer_config.documents_chunk_size {
|
||||
Some(chunk_size) => chunk_size,
|
||||
None => {
|
||||
let default_chunk_size = 1024 * 1024 * 4; // 4MiB
|
||||
let min_chunk_size = 1024 * 512; // 512KiB
|
||||
|
||||
// compute the chunk size from the number of available threads and the inputed data size.
|
||||
let total_size = flattened_documents.metadata().map(|m| m.len());
|
||||
let current_num_threads = pool.current_num_threads();
|
||||
total_size
|
||||
.map_or(default_chunk_size, |size| (size as usize) / current_num_threads)
|
||||
.max(min_chunk_size)
|
||||
}
|
||||
};
|
||||
|
||||
let original_documents = grenad::Reader::new(original_documents)?;
|
||||
let flattened_documents = grenad::Reader::new(flattened_documents)?;
|
||||
|
||||
let max_positions_per_attributes = self.indexer_config.max_positions_per_attributes;
|
||||
|
||||
let cloned_embedder = self.embedders.clone();
|
||||
|
||||
// Run extraction pipeline in parallel.
|
||||
pool.install(|| {
|
||||
puffin::profile_scope!("extract_and_send_grenad_chunks");
|
||||
// split obkv file into several chunks
|
||||
let original_chunk_iter =
|
||||
grenad_obkv_into_chunks(original_documents, pool_params, documents_chunk_size);
|
||||
let stop_words = stop_words.map(|sw| sw.map_data(Vec::from).unwrap());
|
||||
rayon::spawn(move || {
|
||||
puffin::profile_scope!("extract_and_send_grenad_chunks");
|
||||
// split obkv file into several chunks
|
||||
let original_chunk_iter =
|
||||
grenad_obkv_into_chunks(original_documents, pool_params, documents_chunk_size);
|
||||
|
||||
// split obkv file into several chunks
|
||||
let flattened_chunk_iter =
|
||||
grenad_obkv_into_chunks(flattened_documents, pool_params, documents_chunk_size);
|
||||
// split obkv file into several chunks
|
||||
let flattened_chunk_iter =
|
||||
grenad_obkv_into_chunks(flattened_documents, pool_params, documents_chunk_size);
|
||||
|
||||
let result = original_chunk_iter.and_then(|original_chunk| {
|
||||
let flattened_chunk = flattened_chunk_iter?;
|
||||
// extract all databases from the chunked obkv douments
|
||||
extract::data_from_obkv_documents(
|
||||
original_chunk,
|
||||
flattened_chunk,
|
||||
pool_params,
|
||||
lmdb_writer_sx.clone(),
|
||||
searchable_fields,
|
||||
faceted_fields,
|
||||
primary_key_id,
|
||||
geo_fields_ids,
|
||||
field_id_map,
|
||||
stop_words,
|
||||
separators.as_deref(),
|
||||
dictionary.as_deref(),
|
||||
max_positions_per_attributes,
|
||||
exact_attributes,
|
||||
proximity_precision,
|
||||
cloned_embedder,
|
||||
)
|
||||
let separators: Option<Vec<_>> =
|
||||
separators.as_ref().map(|x| x.iter().map(String::as_str).collect());
|
||||
let dictionary: Option<Vec<_>> =
|
||||
dictionary.as_ref().map(|x| x.iter().map(String::as_str).collect());
|
||||
let result = original_chunk_iter.and_then(|original_chunk| {
|
||||
let flattened_chunk = flattened_chunk_iter?;
|
||||
// extract all databases from the chunked obkv douments
|
||||
extract::data_from_obkv_documents(
|
||||
original_chunk,
|
||||
flattened_chunk,
|
||||
pool_params,
|
||||
lmdb_writer_sx.clone(),
|
||||
searchable_fields,
|
||||
faceted_fields,
|
||||
primary_key_id,
|
||||
geo_fields_ids,
|
||||
field_id_map,
|
||||
stop_words,
|
||||
separators.as_deref(),
|
||||
dictionary.as_deref(),
|
||||
max_positions_per_attributes,
|
||||
exact_attributes,
|
||||
proximity_precision,
|
||||
cloned_embedder,
|
||||
)
|
||||
});
|
||||
|
||||
if let Err(e) = result {
|
||||
let _ = lmdb_writer_sx.send(Err(e));
|
||||
}
|
||||
|
||||
// needs to be dropped to avoid channel waiting lock.
|
||||
drop(lmdb_writer_sx);
|
||||
});
|
||||
|
||||
if let Err(e) = result {
|
||||
let _ = lmdb_writer_sx.send(Err(e));
|
||||
}
|
||||
|
||||
// needs to be dropped to avoid channel waiting lock.
|
||||
drop(lmdb_writer_sx);
|
||||
});
|
||||
|
||||
let index_is_empty = self.index.number_of_documents(self.wtxn)? == 0;
|
||||
let mut final_documents_ids = RoaringBitmap::new();
|
||||
|
||||
let mut databases_seen = 0;
|
||||
@@ -444,12 +460,21 @@ where
|
||||
word_fid_docids_reader,
|
||||
} => {
|
||||
let cloneable_chunk = unsafe { as_cloneable_grenad(&word_docids_reader)? };
|
||||
word_docids = Some(cloneable_chunk);
|
||||
let word_docids = word_docids.get_or_insert_with(|| {
|
||||
MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn)
|
||||
});
|
||||
word_docids.push(cloneable_chunk.into_cursor()?);
|
||||
let cloneable_chunk =
|
||||
unsafe { as_cloneable_grenad(&exact_word_docids_reader)? };
|
||||
exact_word_docids = Some(cloneable_chunk);
|
||||
let exact_word_docids = exact_word_docids.get_or_insert_with(|| {
|
||||
MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn)
|
||||
});
|
||||
exact_word_docids.push(cloneable_chunk.into_cursor()?);
|
||||
let cloneable_chunk = unsafe { as_cloneable_grenad(&word_fid_docids_reader)? };
|
||||
word_fid_docids = Some(cloneable_chunk);
|
||||
let word_fid_docids = word_fid_docids.get_or_insert_with(|| {
|
||||
MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn)
|
||||
});
|
||||
word_fid_docids.push(cloneable_chunk.into_cursor()?);
|
||||
TypedChunk::WordDocids {
|
||||
word_docids_reader,
|
||||
exact_word_docids_reader,
|
||||
@@ -458,7 +483,10 @@ where
|
||||
}
|
||||
TypedChunk::WordPositionDocids(chunk) => {
|
||||
let cloneable_chunk = unsafe { as_cloneable_grenad(&chunk)? };
|
||||
word_position_docids = Some(cloneable_chunk);
|
||||
let word_position_docids = word_position_docids.get_or_insert_with(|| {
|
||||
MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn)
|
||||
});
|
||||
word_position_docids.push(cloneable_chunk.into_cursor()?);
|
||||
TypedChunk::WordPositionDocids(chunk)
|
||||
}
|
||||
TypedChunk::VectorPoints {
|
||||
@@ -481,7 +509,7 @@ where
|
||||
};
|
||||
|
||||
let (docids, is_merged_database) =
|
||||
write_typed_chunk_into_index(typed_chunk, self.index, self.wtxn, index_is_empty)?;
|
||||
write_typed_chunk_into_index(typed_chunk, self.index, self.wtxn)?;
|
||||
if !docids.is_empty() {
|
||||
final_documents_ids |= docids;
|
||||
let documents_seen_count = final_documents_ids.len();
|
||||
@@ -538,10 +566,10 @@ where
|
||||
}
|
||||
|
||||
self.execute_prefix_databases(
|
||||
word_docids,
|
||||
exact_word_docids,
|
||||
word_position_docids,
|
||||
word_fid_docids,
|
||||
word_docids.map(MergerBuilder::build),
|
||||
exact_word_docids.map(MergerBuilder::build),
|
||||
word_position_docids.map(MergerBuilder::build),
|
||||
word_fid_docids.map(MergerBuilder::build),
|
||||
)?;
|
||||
|
||||
Ok(number_of_documents)
|
||||
@@ -550,10 +578,10 @@ where
|
||||
#[logging_timer::time("IndexDocuments::{}")]
|
||||
pub fn execute_prefix_databases(
|
||||
self,
|
||||
word_docids: Option<grenad::Reader<CursorClonableMmap>>,
|
||||
exact_word_docids: Option<grenad::Reader<CursorClonableMmap>>,
|
||||
word_position_docids: Option<grenad::Reader<CursorClonableMmap>>,
|
||||
word_fid_docids: Option<grenad::Reader<CursorClonableMmap>>,
|
||||
word_docids: Option<Merger<CursorClonableMmap, MergeFn>>,
|
||||
exact_word_docids: Option<Merger<CursorClonableMmap, MergeFn>>,
|
||||
word_position_docids: Option<Merger<CursorClonableMmap, MergeFn>>,
|
||||
word_fid_docids: Option<Merger<CursorClonableMmap, MergeFn>>,
|
||||
) -> Result<()>
|
||||
where
|
||||
FP: Fn(UpdateIndexingStep) + Sync,
|
||||
@@ -728,7 +756,7 @@ where
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
fn execute_word_prefix_docids(
|
||||
txn: &mut heed::RwTxn,
|
||||
reader: grenad::Reader<Cursor<ClonableMmap>>,
|
||||
merger: Merger<CursorClonableMmap, MergeFn>,
|
||||
word_docids_db: Database<Str, CboRoaringBitmapCodec>,
|
||||
word_prefix_docids_db: Database<Str, CboRoaringBitmapCodec>,
|
||||
indexer_config: &IndexerConfig,
|
||||
@@ -738,13 +766,12 @@ fn execute_word_prefix_docids(
|
||||
) -> Result<()> {
|
||||
puffin::profile_function!();
|
||||
|
||||
let cursor = reader.into_cursor()?;
|
||||
let mut builder = WordPrefixDocids::new(txn, word_docids_db, word_prefix_docids_db);
|
||||
builder.chunk_compression_type = indexer_config.chunk_compression_type;
|
||||
builder.chunk_compression_level = indexer_config.chunk_compression_level;
|
||||
builder.max_nb_chunks = indexer_config.max_nb_chunks;
|
||||
builder.max_memory = indexer_config.max_memory;
|
||||
builder.execute(cursor, new_prefix_fst_words, common_prefix_fst_words, del_prefix_fst_words)?;
|
||||
builder.execute(merger, new_prefix_fst_words, common_prefix_fst_words, del_prefix_fst_words)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -2553,7 +2580,7 @@ mod tests {
|
||||
/// Vectors must be of the same length.
|
||||
#[test]
|
||||
fn test_multiple_vectors() {
|
||||
use crate::vector::settings::{EmbedderSettings, EmbeddingSettings};
|
||||
use crate::vector::settings::EmbeddingSettings;
|
||||
let index = TempIndex::new();
|
||||
|
||||
index
|
||||
@@ -2562,9 +2589,11 @@ mod tests {
|
||||
embedders.insert(
|
||||
"manual".to_string(),
|
||||
Setting::Set(EmbeddingSettings {
|
||||
embedder_options: Setting::Set(EmbedderSettings::UserProvided(
|
||||
crate::vector::settings::UserProvidedSettings { dimensions: 3 },
|
||||
)),
|
||||
source: Setting::Set(crate::vector::settings::EmbedderSource::UserProvided),
|
||||
model: Setting::NotSet,
|
||||
revision: Setting::NotSet,
|
||||
api_key: Setting::NotSet,
|
||||
dimensions: Setting::Set(3),
|
||||
document_template: Setting::NotSet,
|
||||
}),
|
||||
);
|
||||
@@ -2579,10 +2608,10 @@ mod tests {
|
||||
.unwrap();
|
||||
index.add_documents(documents!([{"id": 1, "_vectors": { "manual": [6, 7, 8] }}])).unwrap();
|
||||
index
|
||||
.add_documents(
|
||||
documents!([{"id": 2, "_vectors": { "manual": [[9, 10, 11], [12, 13, 14], [15, 16, 17]] }}]),
|
||||
)
|
||||
.unwrap();
|
||||
.add_documents(
|
||||
documents!([{"id": 2, "_vectors": { "manual": [[9, 10, 11], [12, 13, 14], [15, 16, 17]] }}]),
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let rtxn = index.read_txn().unwrap();
|
||||
let res = index.search(&rtxn).vector([0.0, 1.0, 2.0].to_vec()).execute().unwrap();
|
||||
|
||||
@@ -7,7 +7,7 @@ use bytemuck::allocation::pod_collect_to_vec;
|
||||
use charabia::{Language, Script};
|
||||
use grenad::MergerBuilder;
|
||||
use heed::types::Bytes;
|
||||
use heed::{PutFlags, RwTxn};
|
||||
use heed::RwTxn;
|
||||
use obkv::{KvReader, KvWriter};
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
@@ -119,7 +119,6 @@ pub(crate) fn write_typed_chunk_into_index(
|
||||
typed_chunk: TypedChunk,
|
||||
index: &Index,
|
||||
wtxn: &mut RwTxn,
|
||||
index_is_empty: bool,
|
||||
) -> Result<(RoaringBitmap, bool)> {
|
||||
puffin::profile_function!(typed_chunk.to_debug_string());
|
||||
|
||||
@@ -172,11 +171,10 @@ pub(crate) fn write_typed_chunk_into_index(
|
||||
index.put_documents_ids(wtxn, &docids)?;
|
||||
}
|
||||
TypedChunk::FieldIdWordCountDocids(fid_word_count_docids_iter) => {
|
||||
append_entries_into_database(
|
||||
write_entries_into_database(
|
||||
fid_word_count_docids_iter,
|
||||
&index.field_id_word_count_docids,
|
||||
wtxn,
|
||||
index_is_empty,
|
||||
deladd_serialize_add_side,
|
||||
merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
|
||||
)?;
|
||||
@@ -188,31 +186,28 @@ pub(crate) fn write_typed_chunk_into_index(
|
||||
word_fid_docids_reader,
|
||||
} => {
|
||||
let word_docids_iter = unsafe { as_cloneable_grenad(&word_docids_reader) }?;
|
||||
append_entries_into_database(
|
||||
write_entries_into_database(
|
||||
word_docids_iter.clone(),
|
||||
&index.word_docids,
|
||||
wtxn,
|
||||
index_is_empty,
|
||||
deladd_serialize_add_side,
|
||||
merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
|
||||
)?;
|
||||
|
||||
let exact_word_docids_iter = unsafe { as_cloneable_grenad(&exact_word_docids_reader) }?;
|
||||
append_entries_into_database(
|
||||
write_entries_into_database(
|
||||
exact_word_docids_iter.clone(),
|
||||
&index.exact_word_docids,
|
||||
wtxn,
|
||||
index_is_empty,
|
||||
deladd_serialize_add_side,
|
||||
merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
|
||||
)?;
|
||||
|
||||
let word_fid_docids_iter = unsafe { as_cloneable_grenad(&word_fid_docids_reader) }?;
|
||||
append_entries_into_database(
|
||||
write_entries_into_database(
|
||||
word_fid_docids_iter,
|
||||
&index.word_fid_docids,
|
||||
wtxn,
|
||||
index_is_empty,
|
||||
deladd_serialize_add_side,
|
||||
merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
|
||||
)?;
|
||||
@@ -230,11 +225,10 @@ pub(crate) fn write_typed_chunk_into_index(
|
||||
is_merged_database = true;
|
||||
}
|
||||
TypedChunk::WordPositionDocids(word_position_docids_iter) => {
|
||||
append_entries_into_database(
|
||||
write_entries_into_database(
|
||||
word_position_docids_iter,
|
||||
&index.word_position_docids,
|
||||
wtxn,
|
||||
index_is_empty,
|
||||
deladd_serialize_add_side,
|
||||
merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
|
||||
)?;
|
||||
@@ -251,44 +245,40 @@ pub(crate) fn write_typed_chunk_into_index(
|
||||
is_merged_database = true;
|
||||
}
|
||||
TypedChunk::FieldIdFacetExistsDocids(facet_id_exists_docids) => {
|
||||
append_entries_into_database(
|
||||
write_entries_into_database(
|
||||
facet_id_exists_docids,
|
||||
&index.facet_id_exists_docids,
|
||||
wtxn,
|
||||
index_is_empty,
|
||||
deladd_serialize_add_side,
|
||||
merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
|
||||
)?;
|
||||
is_merged_database = true;
|
||||
}
|
||||
TypedChunk::FieldIdFacetIsNullDocids(facet_id_is_null_docids) => {
|
||||
append_entries_into_database(
|
||||
write_entries_into_database(
|
||||
facet_id_is_null_docids,
|
||||
&index.facet_id_is_null_docids,
|
||||
wtxn,
|
||||
index_is_empty,
|
||||
deladd_serialize_add_side,
|
||||
merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
|
||||
)?;
|
||||
is_merged_database = true;
|
||||
}
|
||||
TypedChunk::FieldIdFacetIsEmptyDocids(facet_id_is_empty_docids) => {
|
||||
append_entries_into_database(
|
||||
write_entries_into_database(
|
||||
facet_id_is_empty_docids,
|
||||
&index.facet_id_is_empty_docids,
|
||||
wtxn,
|
||||
index_is_empty,
|
||||
deladd_serialize_add_side,
|
||||
merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
|
||||
)?;
|
||||
is_merged_database = true;
|
||||
}
|
||||
TypedChunk::WordPairProximityDocids(word_pair_proximity_docids_iter) => {
|
||||
append_entries_into_database(
|
||||
write_entries_into_database(
|
||||
word_pair_proximity_docids_iter,
|
||||
&index.word_pair_proximity_docids,
|
||||
wtxn,
|
||||
index_is_empty,
|
||||
deladd_serialize_add_side,
|
||||
merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
|
||||
)?;
|
||||
@@ -541,7 +531,6 @@ fn write_entries_into_database<R, K, V, FS, FM>(
|
||||
data: grenad::Reader<R>,
|
||||
database: &heed::Database<K, V>,
|
||||
wtxn: &mut RwTxn,
|
||||
index_is_empty: bool,
|
||||
serialize_value: FS,
|
||||
merge_values: FM,
|
||||
) -> Result<()>
|
||||
@@ -559,13 +548,9 @@ where
|
||||
while let Some((key, value)) = cursor.move_on_next()? {
|
||||
if valid_lmdb_key(key) {
|
||||
buffer.clear();
|
||||
let value = if index_is_empty {
|
||||
Some(serialize_value(value, &mut buffer)?)
|
||||
} else {
|
||||
match database.get(wtxn, key)? {
|
||||
Some(prev_value) => merge_values(value, prev_value, &mut buffer)?,
|
||||
None => Some(serialize_value(value, &mut buffer)?),
|
||||
}
|
||||
let value = match database.get(wtxn, key)? {
|
||||
Some(prev_value) => merge_values(value, prev_value, &mut buffer)?,
|
||||
None => Some(serialize_value(value, &mut buffer)?),
|
||||
};
|
||||
match value {
|
||||
Some(value) => database.put(wtxn, key, value)?,
|
||||
@@ -578,58 +563,3 @@ where
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Write provided entries in database using serialize_value function.
|
||||
/// merge_values function is used if an entry already exist in the database.
|
||||
/// All provided entries must be ordered.
|
||||
/// If the index is not empty, write_entries_into_database is called instead.
|
||||
fn append_entries_into_database<R, K, V, FS, FM>(
|
||||
data: grenad::Reader<R>,
|
||||
database: &heed::Database<K, V>,
|
||||
wtxn: &mut RwTxn,
|
||||
index_is_empty: bool,
|
||||
serialize_value: FS,
|
||||
merge_values: FM,
|
||||
) -> Result<()>
|
||||
where
|
||||
R: io::Read + io::Seek,
|
||||
FS: for<'a> Fn(&'a [u8], &'a mut Vec<u8>) -> Result<&'a [u8]>,
|
||||
FM: for<'a> Fn(&[u8], &[u8], &'a mut Vec<u8>) -> Result<Option<&'a [u8]>>,
|
||||
K: for<'a> heed::BytesDecode<'a>,
|
||||
{
|
||||
puffin::profile_function!(format!("number of entries: {}", data.len()));
|
||||
|
||||
if !index_is_empty {
|
||||
return write_entries_into_database(
|
||||
data,
|
||||
database,
|
||||
wtxn,
|
||||
false,
|
||||
serialize_value,
|
||||
merge_values,
|
||||
);
|
||||
}
|
||||
|
||||
let mut buffer = Vec::new();
|
||||
let mut database = database.iter_mut(wtxn)?.remap_types::<Bytes, Bytes>();
|
||||
|
||||
let mut cursor = data.into_cursor()?;
|
||||
while let Some((key, value)) = cursor.move_on_next()? {
|
||||
if valid_lmdb_key(key) {
|
||||
debug_assert!(
|
||||
K::bytes_decode(key).is_ok(),
|
||||
"Couldn't decode key with the database decoder, key length: {} - key bytes: {:x?}",
|
||||
key.len(),
|
||||
&key
|
||||
);
|
||||
buffer.clear();
|
||||
let value = serialize_value(value, &mut buffer)?;
|
||||
unsafe {
|
||||
// safety: We do not keep a reference to anything that lives inside the database
|
||||
database.put_current_with_options::<Bytes>(PutFlags::APPEND, key, value)?
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -8,7 +8,7 @@ pub use self::index_documents::{
|
||||
MergeFn,
|
||||
};
|
||||
pub use self::indexer_config::IndexerConfig;
|
||||
pub use self::settings::{Setting, Settings};
|
||||
pub use self::settings::{validate_embedding_settings, Setting, Settings};
|
||||
pub use self::update_step::UpdateIndexingStep;
|
||||
pub use self::word_prefix_docids::WordPrefixDocids;
|
||||
pub use self::words_prefix_integer_docids::WordPrefixIntegerDocids;
|
||||
|
||||
@@ -17,7 +17,7 @@ use crate::index::{DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS
|
||||
use crate::proximity::ProximityPrecision;
|
||||
use crate::update::index_documents::IndexDocumentsMethod;
|
||||
use crate::update::{IndexDocuments, UpdateIndexingStep};
|
||||
use crate::vector::settings::{EmbeddingSettings, PromptSettings};
|
||||
use crate::vector::settings::{check_set, check_unset, EmbedderSource, EmbeddingSettings};
|
||||
use crate::vector::{Embedder, EmbeddingConfig, EmbeddingConfigs};
|
||||
use crate::{FieldsIdsMap, Index, OrderBy, Result};
|
||||
|
||||
@@ -78,11 +78,19 @@ impl<T> Setting<T> {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn apply(&mut self, new: Self) {
|
||||
/// Returns `true` if applying the new setting changed this setting
|
||||
pub fn apply(&mut self, new: Self) -> bool
|
||||
where
|
||||
T: PartialEq + Eq,
|
||||
{
|
||||
if let Setting::NotSet = new {
|
||||
return;
|
||||
return false;
|
||||
}
|
||||
if self == &new {
|
||||
return false;
|
||||
}
|
||||
*self = new;
|
||||
true
|
||||
}
|
||||
}
|
||||
|
||||
@@ -950,17 +958,23 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
|
||||
.merge_join_by(configs.into_iter(), |(left, _), (right, _)| left.cmp(right))
|
||||
{
|
||||
match joined {
|
||||
// updated config
|
||||
EitherOrBoth::Both((name, mut old), (_, new)) => {
|
||||
old.apply(new);
|
||||
let new = validate_prompt(&name, old)?;
|
||||
changed = true;
|
||||
changed |= old.apply(new);
|
||||
let new = validate_embedding_settings(old, &name)?;
|
||||
new_configs.insert(name, new);
|
||||
}
|
||||
// unchanged config
|
||||
EitherOrBoth::Left((name, setting)) => {
|
||||
new_configs.insert(name, setting);
|
||||
}
|
||||
EitherOrBoth::Right((name, setting)) => {
|
||||
let setting = validate_prompt(&name, setting)?;
|
||||
// new config
|
||||
EitherOrBoth::Right((name, mut setting)) => {
|
||||
// apply the default source in case the source was not set so that it gets validated
|
||||
crate::vector::settings::EmbeddingSettings::apply_default_source(
|
||||
&mut setting,
|
||||
);
|
||||
let setting = validate_embedding_settings(setting, &name)?;
|
||||
changed = true;
|
||||
new_configs.insert(name, setting);
|
||||
}
|
||||
@@ -1072,8 +1086,12 @@ fn validate_prompt(
|
||||
) -> Result<Setting<EmbeddingSettings>> {
|
||||
match new {
|
||||
Setting::Set(EmbeddingSettings {
|
||||
embedder_options,
|
||||
document_template: Setting::Set(PromptSettings { template: Setting::Set(template) }),
|
||||
source,
|
||||
model,
|
||||
revision,
|
||||
api_key,
|
||||
dimensions,
|
||||
document_template: Setting::Set(template),
|
||||
}) => {
|
||||
// validate
|
||||
let template = crate::prompt::Prompt::new(template)
|
||||
@@ -1081,16 +1099,71 @@ fn validate_prompt(
|
||||
.map_err(|inner| UserError::InvalidPromptForEmbeddings(name.to_owned(), inner))?;
|
||||
|
||||
Ok(Setting::Set(EmbeddingSettings {
|
||||
embedder_options,
|
||||
document_template: Setting::Set(PromptSettings {
|
||||
template: Setting::Set(template),
|
||||
}),
|
||||
source,
|
||||
model,
|
||||
revision,
|
||||
api_key,
|
||||
dimensions,
|
||||
document_template: Setting::Set(template),
|
||||
}))
|
||||
}
|
||||
new => Ok(new),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn validate_embedding_settings(
|
||||
settings: Setting<EmbeddingSettings>,
|
||||
name: &str,
|
||||
) -> Result<Setting<EmbeddingSettings>> {
|
||||
let settings = validate_prompt(name, settings)?;
|
||||
let Setting::Set(settings) = settings else { return Ok(settings) };
|
||||
let EmbeddingSettings { source, model, revision, api_key, dimensions, document_template } =
|
||||
settings;
|
||||
let Some(inferred_source) = source.set() else {
|
||||
return Ok(Setting::Set(EmbeddingSettings {
|
||||
source,
|
||||
model,
|
||||
revision,
|
||||
api_key,
|
||||
dimensions,
|
||||
document_template,
|
||||
}));
|
||||
};
|
||||
match inferred_source {
|
||||
EmbedderSource::OpenAi => {
|
||||
check_unset(&revision, "revision", inferred_source, name)?;
|
||||
check_unset(&dimensions, "dimensions", inferred_source, name)?;
|
||||
if let Setting::Set(model) = &model {
|
||||
crate::vector::openai::EmbeddingModel::from_name(model.as_str()).ok_or(
|
||||
crate::error::UserError::InvalidOpenAiModel {
|
||||
embedder_name: name.to_owned(),
|
||||
model: model.clone(),
|
||||
},
|
||||
)?;
|
||||
}
|
||||
}
|
||||
EmbedderSource::HuggingFace => {
|
||||
check_unset(&api_key, "apiKey", inferred_source, name)?;
|
||||
check_unset(&dimensions, "dimensions", inferred_source, name)?;
|
||||
}
|
||||
EmbedderSource::UserProvided => {
|
||||
check_unset(&model, "model", inferred_source, name)?;
|
||||
check_unset(&revision, "revision", inferred_source, name)?;
|
||||
check_unset(&api_key, "apiKey", inferred_source, name)?;
|
||||
check_unset(&document_template, "documentTemplate", inferred_source, name)?;
|
||||
check_set(&dimensions, "dimensions", inferred_source, name)?;
|
||||
}
|
||||
}
|
||||
Ok(Setting::Set(EmbeddingSettings {
|
||||
source,
|
||||
model,
|
||||
revision,
|
||||
api_key,
|
||||
dimensions,
|
||||
document_template,
|
||||
}))
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use big_s::S;
|
||||
|
||||
@@ -42,7 +42,7 @@ impl<'t, 'i> WordPrefixDocids<'t, 'i> {
|
||||
#[logging_timer::time("WordPrefixDocids::{}")]
|
||||
pub fn execute(
|
||||
self,
|
||||
mut new_word_docids_iter: grenad::ReaderCursor<CursorClonableMmap>,
|
||||
new_word_docids: grenad::Merger<CursorClonableMmap, MergeFn>,
|
||||
new_prefix_fst_words: &[String],
|
||||
common_prefix_fst_words: &[&[String]],
|
||||
del_prefix_fst_words: &HashSet<Vec<u8>>,
|
||||
@@ -63,7 +63,8 @@ impl<'t, 'i> WordPrefixDocids<'t, 'i> {
|
||||
if !common_prefix_fst_words.is_empty() {
|
||||
let mut current_prefixes: Option<&&[String]> = None;
|
||||
let mut prefixes_cache = HashMap::new();
|
||||
while let Some((word, data)) = new_word_docids_iter.move_on_next()? {
|
||||
let mut new_word_docids_iter = new_word_docids.into_stream_merger_iter()?;
|
||||
while let Some((word, data)) = new_word_docids_iter.next()? {
|
||||
current_prefixes = match current_prefixes.take() {
|
||||
Some(prefixes) if word.starts_with(prefixes[0].as_bytes()) => Some(prefixes),
|
||||
_otherwise => {
|
||||
|
||||
@@ -47,7 +47,7 @@ impl<'t, 'i> WordPrefixIntegerDocids<'t, 'i> {
|
||||
#[logging_timer::time("WordPrefixIntegerDocids::{}")]
|
||||
pub fn execute(
|
||||
self,
|
||||
new_word_integer_docids: grenad::Reader<CursorClonableMmap>,
|
||||
new_word_integer_docids: grenad::Merger<CursorClonableMmap, MergeFn>,
|
||||
new_prefix_fst_words: &[String],
|
||||
common_prefix_fst_words: &[&[String]],
|
||||
del_prefix_fst_words: &HashSet<Vec<u8>>,
|
||||
@@ -64,14 +64,14 @@ impl<'t, 'i> WordPrefixIntegerDocids<'t, 'i> {
|
||||
self.max_memory,
|
||||
);
|
||||
|
||||
let mut new_word_integer_docids_iter = new_word_integer_docids.into_cursor()?;
|
||||
|
||||
if !common_prefix_fst_words.is_empty() {
|
||||
// We fetch all the new common prefixes between the previous and new prefix fst.
|
||||
let mut buffer = Vec::new();
|
||||
let mut current_prefixes: Option<&&[String]> = None;
|
||||
let mut prefixes_cache = HashMap::new();
|
||||
while let Some((key, data)) = new_word_integer_docids_iter.move_on_next()? {
|
||||
let mut new_word_integer_docids_iter =
|
||||
new_word_integer_docids.into_stream_merger_iter()?;
|
||||
while let Some((key, data)) = new_word_integer_docids_iter.next()? {
|
||||
let (word, pos) =
|
||||
StrBEU16Codec::bytes_decode(key).map_err(heed::Error::Decoding)?;
|
||||
|
||||
|
||||
@@ -34,6 +34,9 @@ pub struct EmbedderOptions {
|
||||
#[serde(deny_unknown_fields, rename_all = "camelCase")]
|
||||
#[deserr(rename_all = camelCase, deny_unknown_fields)]
|
||||
pub enum EmbeddingModel {
|
||||
// # WARNING
|
||||
//
|
||||
// If ever adding a model, make sure to add it to the list of supported models below.
|
||||
#[default]
|
||||
#[serde(rename = "text-embedding-ada-002")]
|
||||
#[deserr(rename = "text-embedding-ada-002")]
|
||||
@@ -41,6 +44,10 @@ pub enum EmbeddingModel {
|
||||
}
|
||||
|
||||
impl EmbeddingModel {
|
||||
pub fn supported_models() -> &'static [&'static str] {
|
||||
&["text-embedding-ada-002"]
|
||||
}
|
||||
|
||||
pub fn max_token(&self) -> usize {
|
||||
match self {
|
||||
EmbeddingModel::TextEmbeddingAda002 => 8191,
|
||||
@@ -59,7 +66,7 @@ impl EmbeddingModel {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn from_name(name: &'static str) -> Option<Self> {
|
||||
pub fn from_name(name: &str) -> Option<Self> {
|
||||
match name {
|
||||
"text-embedding-ada-002" => Some(EmbeddingModel::TextEmbeddingAda002),
|
||||
_ => None,
|
||||
|
||||
@@ -4,32 +4,189 @@ use serde::{Deserialize, Serialize};
|
||||
use crate::prompt::PromptData;
|
||||
use crate::update::Setting;
|
||||
use crate::vector::EmbeddingConfig;
|
||||
use crate::UserError;
|
||||
|
||||
#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq, Deserr)]
|
||||
#[serde(deny_unknown_fields, rename_all = "camelCase")]
|
||||
#[deserr(rename_all = camelCase, deny_unknown_fields)]
|
||||
pub struct EmbeddingSettings {
|
||||
#[serde(default, skip_serializing_if = "Setting::is_not_set", rename = "source")]
|
||||
#[deserr(default, rename = "source")]
|
||||
pub embedder_options: Setting<EmbedderSettings>,
|
||||
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
|
||||
#[deserr(default)]
|
||||
pub document_template: Setting<PromptSettings>,
|
||||
pub source: Setting<EmbedderSource>,
|
||||
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
|
||||
#[deserr(default)]
|
||||
pub model: Setting<String>,
|
||||
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
|
||||
#[deserr(default)]
|
||||
pub revision: Setting<String>,
|
||||
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
|
||||
#[deserr(default)]
|
||||
pub api_key: Setting<String>,
|
||||
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
|
||||
#[deserr(default)]
|
||||
pub dimensions: Setting<usize>,
|
||||
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
|
||||
#[deserr(default)]
|
||||
pub document_template: Setting<String>,
|
||||
}
|
||||
|
||||
pub fn check_unset<T>(
|
||||
key: &Setting<T>,
|
||||
field: &'static str,
|
||||
source: EmbedderSource,
|
||||
embedder_name: &str,
|
||||
) -> Result<(), UserError> {
|
||||
if matches!(key, Setting::NotSet) {
|
||||
Ok(())
|
||||
} else {
|
||||
Err(UserError::InvalidFieldForSource {
|
||||
embedder_name: embedder_name.to_owned(),
|
||||
source_: source,
|
||||
field,
|
||||
allowed_fields_for_source: EmbeddingSettings::allowed_fields_for_source(source),
|
||||
allowed_sources_for_field: EmbeddingSettings::allowed_sources_for_field(field),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
pub fn check_set<T>(
|
||||
key: &Setting<T>,
|
||||
field: &'static str,
|
||||
source: EmbedderSource,
|
||||
embedder_name: &str,
|
||||
) -> Result<(), UserError> {
|
||||
if matches!(key, Setting::Set(_)) {
|
||||
Ok(())
|
||||
} else {
|
||||
Err(UserError::MissingFieldForSource {
|
||||
field,
|
||||
source_: source,
|
||||
embedder_name: embedder_name.to_owned(),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl EmbeddingSettings {
|
||||
pub const SOURCE: &'static str = "source";
|
||||
pub const MODEL: &'static str = "model";
|
||||
pub const REVISION: &'static str = "revision";
|
||||
pub const API_KEY: &'static str = "apiKey";
|
||||
pub const DIMENSIONS: &'static str = "dimensions";
|
||||
pub const DOCUMENT_TEMPLATE: &'static str = "documentTemplate";
|
||||
|
||||
pub fn allowed_sources_for_field(field: &'static str) -> &'static [EmbedderSource] {
|
||||
match field {
|
||||
Self::SOURCE => {
|
||||
&[EmbedderSource::HuggingFace, EmbedderSource::OpenAi, EmbedderSource::UserProvided]
|
||||
}
|
||||
Self::MODEL => &[EmbedderSource::HuggingFace, EmbedderSource::OpenAi],
|
||||
Self::REVISION => &[EmbedderSource::HuggingFace],
|
||||
Self::API_KEY => &[EmbedderSource::OpenAi],
|
||||
Self::DIMENSIONS => &[EmbedderSource::UserProvided],
|
||||
Self::DOCUMENT_TEMPLATE => &[EmbedderSource::HuggingFace, EmbedderSource::OpenAi],
|
||||
_other => unreachable!("unknown field"),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn allowed_fields_for_source(source: EmbedderSource) -> &'static [&'static str] {
|
||||
match source {
|
||||
EmbedderSource::OpenAi => {
|
||||
&[Self::SOURCE, Self::MODEL, Self::API_KEY, Self::DOCUMENT_TEMPLATE]
|
||||
}
|
||||
EmbedderSource::HuggingFace => {
|
||||
&[Self::SOURCE, Self::MODEL, Self::REVISION, Self::DOCUMENT_TEMPLATE]
|
||||
}
|
||||
EmbedderSource::UserProvided => &[Self::SOURCE, Self::DIMENSIONS],
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn apply_default_source(setting: &mut Setting<EmbeddingSettings>) {
|
||||
if let Setting::Set(EmbeddingSettings {
|
||||
source: source @ (Setting::NotSet | Setting::Reset),
|
||||
..
|
||||
}) = setting
|
||||
{
|
||||
*source = Setting::Set(EmbedderSource::default())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy, Default, Serialize, Deserialize, PartialEq, Eq, Deserr)]
|
||||
#[serde(deny_unknown_fields, rename_all = "camelCase")]
|
||||
#[deserr(rename_all = camelCase, deny_unknown_fields)]
|
||||
pub enum EmbedderSource {
|
||||
#[default]
|
||||
OpenAi,
|
||||
HuggingFace,
|
||||
UserProvided,
|
||||
}
|
||||
|
||||
impl std::fmt::Display for EmbedderSource {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
let s = match self {
|
||||
EmbedderSource::OpenAi => "openAi",
|
||||
EmbedderSource::HuggingFace => "huggingFace",
|
||||
EmbedderSource::UserProvided => "userProvided",
|
||||
};
|
||||
f.write_str(s)
|
||||
}
|
||||
}
|
||||
|
||||
impl EmbeddingSettings {
|
||||
pub fn apply(&mut self, new: Self) {
|
||||
let EmbeddingSettings { embedder_options, document_template: prompt } = new;
|
||||
self.embedder_options.apply(embedder_options);
|
||||
self.document_template.apply(prompt);
|
||||
let EmbeddingSettings { source, model, revision, api_key, dimensions, document_template } =
|
||||
new;
|
||||
let old_source = self.source;
|
||||
self.source.apply(source);
|
||||
// Reinitialize the whole setting object on a source change
|
||||
if old_source != self.source {
|
||||
*self = EmbeddingSettings {
|
||||
source,
|
||||
model,
|
||||
revision,
|
||||
api_key,
|
||||
dimensions,
|
||||
document_template,
|
||||
};
|
||||
return;
|
||||
}
|
||||
|
||||
self.model.apply(model);
|
||||
self.revision.apply(revision);
|
||||
self.api_key.apply(api_key);
|
||||
self.dimensions.apply(dimensions);
|
||||
self.document_template.apply(document_template);
|
||||
}
|
||||
}
|
||||
|
||||
impl From<EmbeddingConfig> for EmbeddingSettings {
|
||||
fn from(value: EmbeddingConfig) -> Self {
|
||||
Self {
|
||||
embedder_options: Setting::Set(value.embedder_options.into()),
|
||||
document_template: Setting::Set(value.prompt.into()),
|
||||
let EmbeddingConfig { embedder_options, prompt } = value;
|
||||
match embedder_options {
|
||||
super::EmbedderOptions::HuggingFace(options) => Self {
|
||||
source: Setting::Set(EmbedderSource::HuggingFace),
|
||||
model: Setting::Set(options.model),
|
||||
revision: options.revision.map(Setting::Set).unwrap_or_default(),
|
||||
api_key: Setting::NotSet,
|
||||
dimensions: Setting::NotSet,
|
||||
document_template: Setting::Set(prompt.template),
|
||||
},
|
||||
super::EmbedderOptions::OpenAi(options) => Self {
|
||||
source: Setting::Set(EmbedderSource::OpenAi),
|
||||
model: Setting::Set(options.embedding_model.name().to_owned()),
|
||||
revision: Setting::NotSet,
|
||||
api_key: options.api_key.map(Setting::Set).unwrap_or_default(),
|
||||
dimensions: Setting::NotSet,
|
||||
document_template: Setting::Set(prompt.template),
|
||||
},
|
||||
super::EmbedderOptions::UserProvided(options) => Self {
|
||||
source: Setting::Set(EmbedderSource::UserProvided),
|
||||
model: Setting::NotSet,
|
||||
revision: Setting::NotSet,
|
||||
api_key: Setting::NotSet,
|
||||
dimensions: Setting::Set(options.dimensions),
|
||||
document_template: Setting::NotSet,
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -37,256 +194,51 @@ impl From<EmbeddingConfig> for EmbeddingSettings {
|
||||
impl From<EmbeddingSettings> for EmbeddingConfig {
|
||||
fn from(value: EmbeddingSettings) -> Self {
|
||||
let mut this = Self::default();
|
||||
let EmbeddingSettings { embedder_options, document_template: prompt } = value;
|
||||
if let Some(embedder_options) = embedder_options.set() {
|
||||
this.embedder_options = embedder_options.into();
|
||||
}
|
||||
if let Some(prompt) = prompt.set() {
|
||||
this.prompt = prompt.into();
|
||||
}
|
||||
this
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq, Deserr)]
|
||||
#[serde(deny_unknown_fields, rename_all = "camelCase")]
|
||||
#[deserr(rename_all = camelCase, deny_unknown_fields)]
|
||||
pub struct PromptSettings {
|
||||
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
|
||||
#[deserr(default)]
|
||||
pub template: Setting<String>,
|
||||
}
|
||||
|
||||
impl PromptSettings {
|
||||
pub fn apply(&mut self, new: Self) {
|
||||
let PromptSettings { template } = new;
|
||||
self.template.apply(template);
|
||||
}
|
||||
}
|
||||
|
||||
impl From<PromptData> for PromptSettings {
|
||||
fn from(value: PromptData) -> Self {
|
||||
Self { template: Setting::Set(value.template) }
|
||||
}
|
||||
}
|
||||
|
||||
impl From<PromptSettings> for PromptData {
|
||||
fn from(value: PromptSettings) -> Self {
|
||||
let mut this = PromptData::default();
|
||||
let PromptSettings { template } = value;
|
||||
if let Some(template) = template.set() {
|
||||
this.template = template;
|
||||
}
|
||||
this
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
|
||||
#[serde(deny_unknown_fields, rename_all = "camelCase")]
|
||||
pub enum EmbedderSettings {
|
||||
HuggingFace(Setting<HfEmbedderSettings>),
|
||||
OpenAi(Setting<OpenAiEmbedderSettings>),
|
||||
UserProvided(UserProvidedSettings),
|
||||
}
|
||||
|
||||
impl<E> Deserr<E> for EmbedderSettings
|
||||
where
|
||||
E: deserr::DeserializeError,
|
||||
{
|
||||
fn deserialize_from_value<V: deserr::IntoValue>(
|
||||
value: deserr::Value<V>,
|
||||
location: deserr::ValuePointerRef,
|
||||
) -> Result<Self, E> {
|
||||
match value {
|
||||
deserr::Value::Map(map) => {
|
||||
if deserr::Map::len(&map) != 1 {
|
||||
return Err(deserr::take_cf_content(E::error::<V>(
|
||||
None,
|
||||
deserr::ErrorKind::Unexpected {
|
||||
msg: format!(
|
||||
"Expected a single field, got {} fields",
|
||||
deserr::Map::len(&map)
|
||||
),
|
||||
},
|
||||
location,
|
||||
)));
|
||||
let EmbeddingSettings { source, model, revision, api_key, dimensions, document_template } =
|
||||
value;
|
||||
if let Some(source) = source.set() {
|
||||
match source {
|
||||
EmbedderSource::OpenAi => {
|
||||
let mut options = super::openai::EmbedderOptions::with_default_model(None);
|
||||
if let Some(model) = model.set() {
|
||||
if let Some(model) = super::openai::EmbeddingModel::from_name(&model) {
|
||||
options.embedding_model = model;
|
||||
}
|
||||
}
|
||||
if let Some(api_key) = api_key.set() {
|
||||
options.api_key = Some(api_key);
|
||||
}
|
||||
this.embedder_options = super::EmbedderOptions::OpenAi(options);
|
||||
}
|
||||
let mut it = deserr::Map::into_iter(map);
|
||||
let (k, v) = it.next().unwrap();
|
||||
|
||||
match k.as_str() {
|
||||
"huggingFace" => Ok(EmbedderSettings::HuggingFace(Setting::Set(
|
||||
HfEmbedderSettings::deserialize_from_value(
|
||||
v.into_value(),
|
||||
location.push_key(&k),
|
||||
)?,
|
||||
))),
|
||||
"openAi" => Ok(EmbedderSettings::OpenAi(Setting::Set(
|
||||
OpenAiEmbedderSettings::deserialize_from_value(
|
||||
v.into_value(),
|
||||
location.push_key(&k),
|
||||
)?,
|
||||
))),
|
||||
"userProvided" => Ok(EmbedderSettings::UserProvided(
|
||||
UserProvidedSettings::deserialize_from_value(
|
||||
v.into_value(),
|
||||
location.push_key(&k),
|
||||
)?,
|
||||
)),
|
||||
other => Err(deserr::take_cf_content(E::error::<V>(
|
||||
None,
|
||||
deserr::ErrorKind::UnknownKey {
|
||||
key: other,
|
||||
accepted: &["huggingFace", "openAi", "userProvided"],
|
||||
},
|
||||
location,
|
||||
))),
|
||||
EmbedderSource::HuggingFace => {
|
||||
let mut options = super::hf::EmbedderOptions::default();
|
||||
if let Some(model) = model.set() {
|
||||
options.model = model;
|
||||
// Reset the revision if we are setting the model.
|
||||
// This allows the following:
|
||||
// "huggingFace": {} -> default model with default revision
|
||||
// "huggingFace": { "model": "name-of-the-default-model" } -> default model without a revision
|
||||
// "huggingFace": { "model": "some-other-model" } -> most importantly, other model without a revision
|
||||
options.revision = None;
|
||||
}
|
||||
if let Some(revision) = revision.set() {
|
||||
options.revision = Some(revision);
|
||||
}
|
||||
this.embedder_options = super::EmbedderOptions::HuggingFace(options);
|
||||
}
|
||||
EmbedderSource::UserProvided => {
|
||||
this.embedder_options =
|
||||
super::EmbedderOptions::UserProvided(super::manual::EmbedderOptions {
|
||||
dimensions: dimensions.set().unwrap(),
|
||||
});
|
||||
}
|
||||
}
|
||||
_ => Err(deserr::take_cf_content(E::error::<V>(
|
||||
None,
|
||||
deserr::ErrorKind::IncorrectValueKind {
|
||||
actual: value,
|
||||
accepted: &[deserr::ValueKind::Map],
|
||||
},
|
||||
location,
|
||||
))),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for EmbedderSettings {
|
||||
fn default() -> Self {
|
||||
Self::OpenAi(Default::default())
|
||||
}
|
||||
}
|
||||
|
||||
impl From<crate::vector::EmbedderOptions> for EmbedderSettings {
|
||||
fn from(value: crate::vector::EmbedderOptions) -> Self {
|
||||
match value {
|
||||
crate::vector::EmbedderOptions::HuggingFace(hf) => {
|
||||
Self::HuggingFace(Setting::Set(hf.into()))
|
||||
}
|
||||
crate::vector::EmbedderOptions::OpenAi(openai) => {
|
||||
Self::OpenAi(Setting::Set(openai.into()))
|
||||
}
|
||||
crate::vector::EmbedderOptions::UserProvided(user_provided) => {
|
||||
Self::UserProvided(user_provided.into())
|
||||
}
|
||||
if let Setting::Set(template) = document_template {
|
||||
this.prompt = PromptData { template }
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<EmbedderSettings> for crate::vector::EmbedderOptions {
|
||||
fn from(value: EmbedderSettings) -> Self {
|
||||
match value {
|
||||
EmbedderSettings::HuggingFace(Setting::Set(hf)) => Self::HuggingFace(hf.into()),
|
||||
EmbedderSettings::HuggingFace(_setting) => Self::HuggingFace(Default::default()),
|
||||
EmbedderSettings::OpenAi(Setting::Set(ai)) => Self::OpenAi(ai.into()),
|
||||
EmbedderSettings::OpenAi(_setting) => {
|
||||
Self::OpenAi(crate::vector::openai::EmbedderOptions::with_default_model(None))
|
||||
}
|
||||
EmbedderSettings::UserProvided(user_provided) => {
|
||||
Self::UserProvided(user_provided.into())
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq, Deserr)]
|
||||
#[serde(deny_unknown_fields, rename_all = "camelCase")]
|
||||
#[deserr(rename_all = camelCase, deny_unknown_fields)]
|
||||
pub struct HfEmbedderSettings {
|
||||
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
|
||||
#[deserr(default)]
|
||||
pub model: Setting<String>,
|
||||
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
|
||||
#[deserr(default)]
|
||||
pub revision: Setting<String>,
|
||||
}
|
||||
|
||||
impl HfEmbedderSettings {
|
||||
pub fn apply(&mut self, new: Self) {
|
||||
let HfEmbedderSettings { model, revision } = new;
|
||||
self.model.apply(model);
|
||||
self.revision.apply(revision);
|
||||
}
|
||||
}
|
||||
|
||||
impl From<crate::vector::hf::EmbedderOptions> for HfEmbedderSettings {
|
||||
fn from(value: crate::vector::hf::EmbedderOptions) -> Self {
|
||||
Self {
|
||||
model: Setting::Set(value.model),
|
||||
revision: value.revision.map(Setting::Set).unwrap_or(Setting::NotSet),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<HfEmbedderSettings> for crate::vector::hf::EmbedderOptions {
|
||||
fn from(value: HfEmbedderSettings) -> Self {
|
||||
let HfEmbedderSettings { model, revision } = value;
|
||||
let mut this = Self::default();
|
||||
if let Some(model) = model.set() {
|
||||
this.model = model;
|
||||
}
|
||||
if let Some(revision) = revision.set() {
|
||||
this.revision = Some(revision);
|
||||
}
|
||||
this
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq, Deserr)]
|
||||
#[serde(deny_unknown_fields, rename_all = "camelCase")]
|
||||
#[deserr(rename_all = camelCase, deny_unknown_fields)]
|
||||
pub struct OpenAiEmbedderSettings {
|
||||
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
|
||||
#[deserr(default)]
|
||||
pub api_key: Setting<String>,
|
||||
#[serde(default, skip_serializing_if = "Setting::is_not_set", rename = "model")]
|
||||
#[deserr(default, rename = "model")]
|
||||
pub embedding_model: Setting<crate::vector::openai::EmbeddingModel>,
|
||||
}
|
||||
|
||||
impl OpenAiEmbedderSettings {
|
||||
pub fn apply(&mut self, new: Self) {
|
||||
let Self { api_key, embedding_model: embedding_mode } = new;
|
||||
self.api_key.apply(api_key);
|
||||
self.embedding_model.apply(embedding_mode);
|
||||
}
|
||||
}
|
||||
|
||||
impl From<crate::vector::openai::EmbedderOptions> for OpenAiEmbedderSettings {
|
||||
fn from(value: crate::vector::openai::EmbedderOptions) -> Self {
|
||||
Self {
|
||||
api_key: value.api_key.map(Setting::Set).unwrap_or(Setting::Reset),
|
||||
embedding_model: Setting::Set(value.embedding_model),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<OpenAiEmbedderSettings> for crate::vector::openai::EmbedderOptions {
|
||||
fn from(value: OpenAiEmbedderSettings) -> Self {
|
||||
let OpenAiEmbedderSettings { api_key, embedding_model } = value;
|
||||
Self { api_key: api_key.set(), embedding_model: embedding_model.set().unwrap_or_default() }
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq, Deserr)]
|
||||
#[serde(deny_unknown_fields, rename_all = "camelCase")]
|
||||
#[deserr(rename_all = camelCase, deny_unknown_fields)]
|
||||
pub struct UserProvidedSettings {
|
||||
pub dimensions: usize,
|
||||
}
|
||||
|
||||
impl From<UserProvidedSettings> for crate::vector::manual::EmbedderOptions {
|
||||
fn from(value: UserProvidedSettings) -> Self {
|
||||
Self { dimensions: value.dimensions }
|
||||
}
|
||||
}
|
||||
|
||||
impl From<crate::vector::manual::EmbedderOptions> for UserProvidedSettings {
|
||||
fn from(value: crate::vector::manual::EmbedderOptions) -> Self {
|
||||
Self { dimensions: value.dimensions }
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user