Compare commits

..

19 Commits

Author SHA1 Message Date
b3952e8b3d Only spawn request threads if necessary 2024-06-18 15:34:39 +02:00
c668043c4f Merge #4617
4617: Destructure `EmbedderOptions` so we don't miss some options r=dureuill a=dureuill

# Pull Request

## Related issue
#4595 was caused by the code not destructuring the embedder options.


## What does this PR do?
This PR adds the missing `url` parameter for ollama, and makes sure similar issue cannot happen in the future



Co-authored-by: Louis Dureuil <louis@meilisearch.com>
2024-05-02 14:55:32 +00:00
5a305bfdea Remove unused struct 2024-05-02 16:14:37 +02:00
f4dd73ec8c Destructure EmbedderOptions so we don't miss some options 2024-05-02 15:39:36 +02:00
66dce4600d Merge #4603
4603: Update charabia v0.8.10 r=Kerollmops a=ManyTheFish

- Update Charabia v0.8.10
- Add `swedish-recomposition` as an optional feature flag

Co-authored-by: ManyTheFish <many@meilisearch.com>
2024-04-30 13:04:02 +00:00
fe51ceca6d Update lock file 2024-04-30 14:33:37 +02:00
88174b8ae4 Update charabia v0.8.10 2024-04-30 14:30:23 +02:00
ebca29f3de Merge #4597
4597: Fix embeddings settings update r=ManyTheFish a=ManyTheFish

# Pull Request
- add some conditions reducing the work done when changing the settings
- add some benchmarks on embedders

## Related issue
Fixes #4585


Co-authored-by: ManyTheFish <many@meilisearch.com>
2024-04-25 16:37:28 +00:00
c793b6ef6d Merge #4600
4600: Fix embedders api r=ManyTheFish a=ManyTheFish

# Pull Request

## Related issue
Fixes #4594
Fixes #4595


Co-authored-by: ManyTheFish <many@meilisearch.com>
2024-04-25 13:16:33 +00:00
cbbfff3594 Remove debuging prints 2024-04-25 10:37:18 +02:00
dbcf50589b Fix clippy 2024-04-25 10:36:10 +02:00
3e5cd027a5 Merge #4593
4593: Stop crashing when panic occurs in thread pool r=ManyTheFish a=Kerollmops

This PR fixes #4362 by introducing a new boolean to catch panics in the rayon thread pool. The boolean is read after performing the operations in rayon, and the indexation process is stopped. This first version doesn't expose the panic message but marks the task as failed.

The current implementation exposes a `ThreadPoolNoAbort` wrapper. The `rayon::ThreadPool` has been wrapped to check that nothing went wrong after running the `ThreadPool::install` function. An atomic boolean and some `store/load` logic make the system work efficiently.

Before, Meilisearch was completely crashing...

<img width="1563" alt="Capture d’écran 2024-04-22 à 15 49 02" src="https://github.com/meilisearch/meilisearch/assets/3610253/ce114917-a881-4fbb-85df-c195fcf0c7cb">

Now, it handles the panics correctly and marks the task as failed.

<img width="1558" alt="Capture d’écran 2024-04-22 à 15 42 14" src="https://github.com/meilisearch/meilisearch/assets/3610253/8bd031ef-5e8f-4a12-a91e-c823597a2344">


Co-authored-by: Clément Renault <clement@meilisearch.com>
2024-04-24 16:27:08 +00:00
7468c1cf8d Introduce WildcardSetting that are serialized as wildcards by default 2024-04-24 18:15:03 +02:00
d4aeff92d0 Introduce the ThreadPoolNoAbort wrapper 2024-04-24 16:40:12 +02:00
e87cb373de Avoid intermediate serializing when displaying settings 2024-04-24 12:33:07 +02:00
9b76501875 Display set API key for Ollama embedder 2024-04-24 12:33:07 +02:00
b3173d0423 Remove useless dots in the error messages 2024-04-22 18:09:33 +02:00
96cc5319c8 Introduce a new internal error type to categorize panics 2024-04-22 18:09:33 +02:00
0c7003c5df Introduce an atomic to catch panics in thread pools 2024-04-22 18:09:33 +02:00
26 changed files with 259 additions and 108 deletions

4
Cargo.lock generated
View File

@ -889,9 +889,9 @@ dependencies = [
[[package]]
name = "charabia"
version = "0.8.9"
version = "0.8.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f6a65052f308636e5d5e1777f0dbc07919f5fbac24b6c8ad3e140472e5520de9"
checksum = "933f20f2269b24d32fd5503e7b3c268af902190daf8d9d2b73ed2e75d77c00b4"
dependencies = [
"aho-corasick",
"cow-utils",

View File

@ -256,8 +256,8 @@ pub(crate) mod test {
pub fn create_test_settings() -> Settings<Checked> {
let settings = Settings {
displayed_attributes: Setting::Set(vec![S("race"), S("name")]),
searchable_attributes: Setting::Set(vec![S("name"), S("race")]),
displayed_attributes: Setting::Set(vec![S("race"), S("name")]).into(),
searchable_attributes: Setting::Set(vec![S("name"), S("race")]).into(),
filterable_attributes: Setting::Set(btreeset! { S("race"), S("age") }),
sortable_attributes: Setting::Set(btreeset! { S("age") }),
ranking_rules: Setting::NotSet,

View File

@ -315,8 +315,8 @@ impl From<v5::ResponseError> for v6::ResponseError {
impl<T> From<v5::Settings<T>> for v6::Settings<v6::Unchecked> {
fn from(settings: v5::Settings<T>) -> Self {
v6::Settings {
displayed_attributes: settings.displayed_attributes.into(),
searchable_attributes: settings.searchable_attributes.into(),
displayed_attributes: v6::Setting::from(settings.displayed_attributes).into(),
searchable_attributes: v6::Setting::from(settings.searchable_attributes).into(),
filterable_attributes: settings.filterable_attributes.into(),
sortable_attributes: settings.sortable_attributes.into(),
ranking_rules: {

View File

@ -6,7 +6,7 @@ source: index-scheduler/src/lib.rs
[]
----------------------------------------------------------------------
### All Tasks:
0 {uid: 0, status: enqueued, details: { settings: Settings { displayed_attributes: NotSet, searchable_attributes: NotSet, filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(4), document_template: NotSet, url: Set("http://localhost:7777"), query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData<meilisearch_types::settings::Unchecked> } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: NotSet, searchable_attributes: NotSet, filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(4), document_template: NotSet, url: Set("http://localhost:7777"), query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData<meilisearch_types::settings::Unchecked> }, is_deletion: false, allow_index_creation: true }}
0 {uid: 0, status: enqueued, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(4), document_template: NotSet, url: Set("http://localhost:7777"), query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData<meilisearch_types::settings::Unchecked> } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(4), document_template: NotSet, url: Set("http://localhost:7777"), query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData<meilisearch_types::settings::Unchecked> }, is_deletion: false, allow_index_creation: true }}
----------------------------------------------------------------------
### Status:
enqueued [0,]

View File

@ -6,7 +6,7 @@ source: index-scheduler/src/lib.rs
[]
----------------------------------------------------------------------
### All Tasks:
0 {uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: NotSet, searchable_attributes: NotSet, filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(4), document_template: NotSet, url: Set("http://localhost:7777"), query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData<meilisearch_types::settings::Unchecked> } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: NotSet, searchable_attributes: NotSet, filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(4), document_template: NotSet, url: Set("http://localhost:7777"), query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData<meilisearch_types::settings::Unchecked> }, is_deletion: false, allow_index_creation: true }}
0 {uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(4), document_template: NotSet, url: Set("http://localhost:7777"), query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData<meilisearch_types::settings::Unchecked> } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(4), document_template: NotSet, url: Set("http://localhost:7777"), query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData<meilisearch_types::settings::Unchecked> }, is_deletion: false, allow_index_creation: true }}
----------------------------------------------------------------------
### Status:
enqueued []

View File

@ -57,3 +57,5 @@ greek = ["milli/greek"]
khmer = ["milli/khmer"]
# allow vietnamese specialized tokenization
vietnamese = ["milli/vietnamese"]
# force swedish character recomposition
swedish-recomposition = ["milli/swedish-recomposition"]

View File

@ -3,7 +3,7 @@ use std::convert::Infallible;
use std::fmt;
use std::marker::PhantomData;
use std::num::NonZeroUsize;
use std::ops::ControlFlow;
use std::ops::{ControlFlow, Deref};
use std::str::FromStr;
use deserr::{DeserializeError, Deserr, ErrorKind, MergeWithError, ValuePointerRef};
@ -143,21 +143,13 @@ impl MergeWithError<milli::CriterionError> for DeserrJsonError<InvalidSettingsRa
)]
#[deserr(error = DeserrJsonError, rename_all = camelCase, deny_unknown_fields)]
pub struct Settings<T> {
#[serde(
default,
serialize_with = "serialize_with_wildcard",
skip_serializing_if = "Setting::is_not_set"
)]
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
#[deserr(default, error = DeserrJsonError<InvalidSettingsDisplayedAttributes>)]
pub displayed_attributes: Setting<Vec<String>>,
pub displayed_attributes: WildcardSetting,
#[serde(
default,
serialize_with = "serialize_with_wildcard",
skip_serializing_if = "Setting::is_not_set"
)]
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
#[deserr(default, error = DeserrJsonError<InvalidSettingsSearchableAttributes>)]
pub searchable_attributes: Setting<Vec<String>>,
pub searchable_attributes: WildcardSetting,
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
#[deserr(default, error = DeserrJsonError<InvalidSettingsFilterableAttributes>)]
@ -251,8 +243,8 @@ impl<T> Settings<T> {
impl Settings<Checked> {
pub fn cleared() -> Settings<Checked> {
Settings {
displayed_attributes: Setting::Reset,
searchable_attributes: Setting::Reset,
displayed_attributes: Setting::Reset.into(),
searchable_attributes: Setting::Reset.into(),
filterable_attributes: Setting::Reset,
sortable_attributes: Setting::Reset,
ranking_rules: Setting::Reset,
@ -319,7 +311,7 @@ impl Settings<Checked> {
impl Settings<Unchecked> {
pub fn check(self) -> Settings<Checked> {
let displayed_attributes = match self.displayed_attributes {
let displayed_attributes = match self.displayed_attributes.0 {
Setting::Set(fields) => {
if fields.iter().any(|f| f == "*") {
Setting::Reset
@ -330,7 +322,7 @@ impl Settings<Unchecked> {
otherwise => otherwise,
};
let searchable_attributes = match self.searchable_attributes {
let searchable_attributes = match self.searchable_attributes.0 {
Setting::Set(fields) => {
if fields.iter().any(|f| f == "*") {
Setting::Reset
@ -342,8 +334,8 @@ impl Settings<Unchecked> {
};
Settings {
displayed_attributes,
searchable_attributes,
displayed_attributes: displayed_attributes.into(),
searchable_attributes: searchable_attributes.into(),
filterable_attributes: self.filterable_attributes,
sortable_attributes: self.sortable_attributes,
ranking_rules: self.ranking_rules,
@ -412,13 +404,13 @@ pub fn apply_settings_to_builder(
_kind,
} = settings;
match searchable_attributes {
match searchable_attributes.deref() {
Setting::Set(ref names) => builder.set_searchable_fields(names.clone()),
Setting::Reset => builder.reset_searchable_fields(),
Setting::NotSet => (),
}
match displayed_attributes {
match displayed_attributes.deref() {
Setting::Set(ref names) => builder.set_displayed_fields(names.clone()),
Setting::Reset => builder.reset_displayed_fields(),
Setting::NotSet => (),
@ -690,11 +682,13 @@ pub fn settings(
displayed_attributes: match displayed_attributes {
Some(attrs) => Setting::Set(attrs),
None => Setting::Reset,
},
}
.into(),
searchable_attributes: match searchable_attributes {
Some(attrs) => Setting::Set(attrs),
None => Setting::Reset,
},
}
.into(),
filterable_attributes: Setting::Set(filterable_attributes),
sortable_attributes: Setting::Set(sortable_attributes),
ranking_rules: Setting::Set(criteria.iter().map(|c| c.clone().into()).collect()),
@ -848,6 +842,41 @@ impl From<ProximityPrecisionView> for ProximityPrecision {
}
}
#[derive(Debug, Clone, Default, Deserialize, PartialEq, Eq)]
pub struct WildcardSetting(Setting<Vec<String>>);
impl From<Setting<Vec<String>>> for WildcardSetting {
fn from(setting: Setting<Vec<String>>) -> Self {
Self(setting)
}
}
impl Serialize for WildcardSetting {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where
S: Serializer,
{
serialize_with_wildcard(&self.0, serializer)
}
}
impl<E: deserr::DeserializeError> Deserr<E> for WildcardSetting {
fn deserialize_from_value<V: deserr::IntoValue>(
value: deserr::Value<V>,
location: ValuePointerRef<'_>,
) -> Result<Self, E> {
Ok(Self(Setting::deserialize_from_value(value, location)?))
}
}
impl std::ops::Deref for WildcardSetting {
type Target = Setting<Vec<String>>;
fn deref(&self) -> &Self::Target {
&self.0
}
}
#[cfg(test)]
pub(crate) mod test {
use super::*;
@ -856,8 +885,8 @@ pub(crate) mod test {
fn test_setting_check() {
// test no changes
let settings = Settings {
displayed_attributes: Setting::Set(vec![String::from("hello")]),
searchable_attributes: Setting::Set(vec![String::from("hello")]),
displayed_attributes: Setting::Set(vec![String::from("hello")]).into(),
searchable_attributes: Setting::Set(vec![String::from("hello")]).into(),
filterable_attributes: Setting::NotSet,
sortable_attributes: Setting::NotSet,
ranking_rules: Setting::NotSet,
@ -883,8 +912,9 @@ pub(crate) mod test {
// test wildcard
// test no changes
let settings = Settings {
displayed_attributes: Setting::Set(vec![String::from("*")]),
searchable_attributes: Setting::Set(vec![String::from("hello"), String::from("*")]),
displayed_attributes: Setting::Set(vec![String::from("*")]).into(),
searchable_attributes: Setting::Set(vec![String::from("hello"), String::from("*")])
.into(),
filterable_attributes: Setting::NotSet,
sortable_attributes: Setting::NotSet,
ranking_rules: Setting::NotSet,
@ -904,7 +934,7 @@ pub(crate) mod test {
};
let checked = settings.check();
assert_eq!(checked.displayed_attributes, Setting::Reset);
assert_eq!(checked.searchable_attributes, Setting::Reset);
assert_eq!(checked.displayed_attributes, Setting::Reset.into());
assert_eq!(checked.searchable_attributes, Setting::Reset.into());
}
}

View File

@ -156,6 +156,7 @@ thai = ["meilisearch-types/thai"]
greek = ["meilisearch-types/greek"]
khmer = ["meilisearch-types/khmer"]
vietnamese = ["meilisearch-types/vietnamese"]
swedish-recomposition = ["meilisearch-types/swedish-recomposition"]
[package.metadata.mini-dashboard]
assets-url = "https://github.com/meilisearch/mini-dashboard/releases/download/v0.2.13/build.zip"

View File

@ -13,6 +13,7 @@ use byte_unit::{Byte, ByteError};
use clap::Parser;
use meilisearch_types::features::InstanceTogglableFeatures;
use meilisearch_types::milli::update::IndexerConfig;
use meilisearch_types::milli::ThreadPoolNoAbortBuilder;
use rustls::server::{
AllowAnyAnonymousOrAuthenticatedClient, AllowAnyAuthenticatedClient, ServerSessionMemoryCache,
};
@ -666,7 +667,7 @@ impl TryFrom<&IndexerOpts> for IndexerConfig {
type Error = anyhow::Error;
fn try_from(other: &IndexerOpts) -> Result<Self, Self::Error> {
let thread_pool = rayon::ThreadPoolBuilder::new()
let thread_pool = ThreadPoolNoAbortBuilder::new()
.thread_name(|index| format!("indexing-thread:{index}"))
.num_threads(*other.max_indexing_threads)
.build()?;

View File

@ -137,10 +137,8 @@ macro_rules! make_setting_route {
let settings = settings(&index, &rtxn, meilisearch_types::settings::SecretPolicy::HideSecrets)?;
debug!(returns = ?settings, "Update settings");
let mut json = serde_json::json!(&settings);
let val = json[$camelcase_attr].take();
Ok(HttpResponse::Ok().json(val))
Ok(HttpResponse::Ok().json(settings.$attr))
}
pub fn resources() -> Resource {

View File

@ -367,12 +367,6 @@ async fn get_version(
})
}
#[derive(Serialize)]
struct KeysResponse {
private: Option<String>,
public: Option<String>,
}
pub async fn get_health(
index_scheduler: Data<IndexScheduler>,
auth_controller: Data<AuthController>,

View File

@ -17,7 +17,7 @@ bincode = "1.3.3"
bstr = "1.9.0"
bytemuck = { version = "1.14.0", features = ["extern_crate_alloc"] }
byteorder = "1.5.0"
charabia = { version = "0.8.9", default-features = false }
charabia = { version = "0.8.10", default-features = false }
concat-arrays = "0.1.2"
crossbeam-channel = "0.5.11"
deserr = "0.6.1"
@ -136,7 +136,11 @@ greek = ["charabia/greek"]
# allow khmer specialized tokenization
khmer = ["charabia/khmer"]
# allow vietnamese specialized tokenization
vietnamese = ["charabia/vietnamese"]
# force swedish character recomposition
swedish-recomposition = ["charabia/swedish-recomposition"]
# allow CUDA support, see <https://github.com/meilisearch/meilisearch/issues/4306>
cuda = ["candle-core/cuda"]

View File

@ -9,6 +9,7 @@ use serde_json::Value;
use thiserror::Error;
use crate::documents::{self, DocumentsBatchCursorError};
use crate::thread_pool_no_abort::PanicCatched;
use crate::{CriterionError, DocumentId, FieldId, Object, SortError};
pub fn is_reserved_keyword(keyword: &str) -> bool {
@ -39,17 +40,19 @@ pub enum InternalError {
Fst(#[from] fst::Error),
#[error(transparent)]
DocumentsError(#[from] documents::Error),
#[error("Invalid compression type have been specified to grenad.")]
#[error("Invalid compression type have been specified to grenad")]
GrenadInvalidCompressionType,
#[error("Invalid grenad file with an invalid version format.")]
#[error("Invalid grenad file with an invalid version format")]
GrenadInvalidFormatVersion,
#[error("Invalid merge while processing {process}.")]
#[error("Invalid merge while processing {process}")]
IndexingMergingKeys { process: &'static str },
#[error("{}", HeedError::InvalidDatabaseTyping)]
InvalidDatabaseTyping,
#[error(transparent)]
RayonThreadPool(#[from] ThreadPoolBuildError),
#[error(transparent)]
PanicInThreadPool(#[from] PanicCatched),
#[error(transparent)]
SerdeJson(#[from] serde_json::Error),
#[error(transparent)]
Serialization(#[from] SerializationError),
@ -57,9 +60,9 @@ pub enum InternalError {
Store(#[from] MdbError),
#[error(transparent)]
Utf8(#[from] str::Utf8Error),
#[error("An indexation process was explicitly aborted.")]
#[error("An indexation process was explicitly aborted")]
AbortedIndexation,
#[error("The matching words list contains at least one invalid member.")]
#[error("The matching words list contains at least one invalid member")]
InvalidMatchingWords,
#[error(transparent)]
ArroyError(#[from] arroy::Error),

View File

@ -21,6 +21,7 @@ pub mod prompt;
pub mod proximity;
pub mod score_details;
mod search;
mod thread_pool_no_abort;
pub mod update;
pub mod vector;
@ -42,6 +43,7 @@ pub use search::new::{
SearchLogger, VisualSearchLogger,
};
use serde_json::Value;
pub use thread_pool_no_abort::{PanicCatched, ThreadPoolNoAbort, ThreadPoolNoAbortBuilder};
pub use {charabia as tokenizer, heed};
pub use self::asc_desc::{AscDesc, AscDescError, Member, SortError};

View File

@ -0,0 +1,69 @@
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::Arc;
use rayon::{ThreadPool, ThreadPoolBuilder};
use thiserror::Error;
/// A rayon ThreadPool wrapper that can catch panics in the pool
/// and modifies the install function accordingly.
#[derive(Debug)]
pub struct ThreadPoolNoAbort {
thread_pool: ThreadPool,
/// Set to true if the thread pool catched a panic.
pool_catched_panic: Arc<AtomicBool>,
}
impl ThreadPoolNoAbort {
pub fn install<OP, R>(&self, op: OP) -> Result<R, PanicCatched>
where
OP: FnOnce() -> R + Send,
R: Send,
{
let output = self.thread_pool.install(op);
// While reseting the pool panic catcher we return an error if we catched one.
if self.pool_catched_panic.swap(false, Ordering::SeqCst) {
Err(PanicCatched)
} else {
Ok(output)
}
}
pub fn current_num_threads(&self) -> usize {
self.thread_pool.current_num_threads()
}
}
#[derive(Error, Debug)]
#[error("A panic occured. Read the logs to find more information about it")]
pub struct PanicCatched;
#[derive(Default)]
pub struct ThreadPoolNoAbortBuilder(ThreadPoolBuilder);
impl ThreadPoolNoAbortBuilder {
pub fn new() -> ThreadPoolNoAbortBuilder {
ThreadPoolNoAbortBuilder::default()
}
pub fn thread_name<F>(mut self, closure: F) -> Self
where
F: FnMut(usize) -> String + 'static,
{
self.0 = self.0.thread_name(closure);
self
}
pub fn num_threads(mut self, num_threads: usize) -> ThreadPoolNoAbortBuilder {
self.0 = self.0.num_threads(num_threads);
self
}
pub fn build(mut self) -> Result<ThreadPoolNoAbort, rayon::ThreadPoolBuildError> {
let pool_catched_panic = Arc::new(AtomicBool::new(false));
self.0 = self.0.panic_handler({
let catched_panic = pool_catched_panic.clone();
move |_result| catched_panic.store(true, Ordering::SeqCst)
});
Ok(ThreadPoolNoAbort { thread_pool: self.0.build()?, pool_catched_panic })
}
}

View File

@ -19,7 +19,7 @@ use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
use crate::update::index_documents::helpers::try_split_at;
use crate::update::settings::InnerIndexSettingsDiff;
use crate::vector::Embedder;
use crate::{DocumentId, InternalError, Result, VectorOrArrayOfVectors};
use crate::{DocumentId, InternalError, Result, ThreadPoolNoAbort, VectorOrArrayOfVectors};
/// The length of the elements that are always in the buffer when inserting new values.
const TRUNCATE_SIZE: usize = size_of::<DocumentId>();
@ -362,7 +362,7 @@ pub fn extract_embeddings<R: io::Read + io::Seek>(
prompt_reader: grenad::Reader<R>,
indexer: GrenadParameters,
embedder: Arc<Embedder>,
request_threads: &rayon::ThreadPool,
request_threads: &ThreadPoolNoAbort,
) -> Result<grenad::Reader<BufReader<File>>> {
puffin::profile_function!();
let n_chunks = embedder.chunk_count_hint(); // chunk level parallelism

View File

@ -31,7 +31,7 @@ use self::extract_word_position_docids::extract_word_position_docids;
use super::helpers::{as_cloneable_grenad, CursorClonableMmap, GrenadParameters};
use super::{helpers, TypedChunk};
use crate::update::settings::InnerIndexSettingsDiff;
use crate::{FieldId, Result};
use crate::{FieldId, Result, ThreadPoolNoAbortBuilder};
/// Extract data for each databases from obkv documents in parallel.
/// Send data in grenad file over provided Sender.
@ -229,12 +229,15 @@ fn send_original_documents_data(
let documents_chunk_cloned = original_documents_chunk.clone();
let lmdb_writer_sx_cloned = lmdb_writer_sx.clone();
let request_threads = rayon::ThreadPoolBuilder::new()
.num_threads(crate::vector::REQUEST_PARALLELISM)
.thread_name(|index| format!("embedding-request-{index}"))
.build()?;
let new_embedding_configs = settings_diff.new.embedding_configs.clone();
if settings_diff.reindex_vectors() || !settings_diff.settings_update_only() {
if (settings_diff.reindex_vectors() || !settings_diff.settings_update_only())
&& new_embedding_configs.get_default().is_some()
{
let request_threads = ThreadPoolNoAbortBuilder::new()
.num_threads(crate::vector::REQUEST_PARALLELISM)
.thread_name(|index| format!("embedding-request-{index}"))
.build()?;
let settings_diff = settings_diff.clone();
rayon::spawn(move || {
for (name, (embedder, prompt)) in settings_diff.new.embedding_configs.clone() {

View File

@ -33,6 +33,7 @@ use self::helpers::{grenad_obkv_into_chunks, GrenadParameters};
pub use self::transform::{Transform, TransformOutput};
use crate::documents::{obkv_to_object, DocumentsBatchReader};
use crate::error::{Error, InternalError, UserError};
use crate::thread_pool_no_abort::ThreadPoolNoAbortBuilder;
pub use crate::update::index_documents::helpers::CursorClonableMmap;
use crate::update::{
IndexerConfig, UpdateIndexingStep, WordPrefixDocids, WordPrefixIntegerDocids, WordsPrefixesFst,
@ -298,18 +299,18 @@ where
let backup_pool;
let pool = match self.indexer_config.thread_pool {
Some(ref pool) => pool,
#[cfg(not(test))]
None => {
// We initialize a bakcup pool with the default
// We initialize a backup pool with the default
// settings if none have already been set.
backup_pool = rayon::ThreadPoolBuilder::new().build()?;
&backup_pool
}
#[cfg(test)]
None => {
// We initialize a bakcup pool with the default
// settings if none have already been set.
backup_pool = rayon::ThreadPoolBuilder::new().num_threads(1).build()?;
#[allow(unused_mut)]
let mut pool_builder = ThreadPoolNoAbortBuilder::new();
#[cfg(test)]
{
pool_builder = pool_builder.num_threads(1);
}
backup_pool = pool_builder.build()?;
&backup_pool
}
};
@ -533,7 +534,7 @@ where
}
Ok(())
})?;
}).map_err(InternalError::from)??;
// We write the field distribution into the main database
self.index.put_field_distribution(self.wtxn, &field_distribution)?;
@ -562,7 +563,8 @@ where
writer.build(wtxn, &mut rng, None)?;
}
Result::Ok(())
})?;
})
.map_err(InternalError::from)??;
}
self.execute_prefix_databases(

View File

@ -1,5 +1,6 @@
use grenad::CompressionType;
use rayon::ThreadPool;
use crate::thread_pool_no_abort::ThreadPoolNoAbort;
#[derive(Debug)]
pub struct IndexerConfig {
@ -9,7 +10,7 @@ pub struct IndexerConfig {
pub max_memory: Option<usize>,
pub chunk_compression_type: CompressionType,
pub chunk_compression_level: Option<u32>,
pub thread_pool: Option<ThreadPool>,
pub thread_pool: Option<ThreadPoolNoAbort>,
pub max_positions_per_attributes: Option<u32>,
pub skip_index_budget: bool,
}

View File

@ -3,6 +3,7 @@ use std::path::PathBuf;
use hf_hub::api::sync::ApiError;
use crate::error::FaultSource;
use crate::PanicCatched;
#[derive(Debug, thiserror::Error)]
#[error("Error while generating embeddings: {inner}")]
@ -80,6 +81,8 @@ pub enum EmbedErrorKind {
OpenAiUnexpectedDimension(usize, usize),
#[error("no embedding was produced")]
MissingEmbedding,
#[error(transparent)]
PanicInThreadPool(#[from] PanicCatched),
}
impl EmbedError {

View File

@ -7,6 +7,7 @@ use serde::{Deserialize, Serialize};
use self::error::{EmbedError, NewEmbedderError};
use crate::prompt::{Prompt, PromptData};
use crate::ThreadPoolNoAbort;
pub mod error;
pub mod hf;
@ -254,7 +255,7 @@ impl Embedder {
pub fn embed_chunks(
&self,
text_chunks: Vec<Vec<String>>,
threads: &rayon::ThreadPool,
threads: &ThreadPoolNoAbort,
) -> std::result::Result<Vec<Vec<Embeddings<f32>>>, EmbedError> {
match self {
Embedder::HuggingFace(embedder) => embedder.embed_chunks(text_chunks),

View File

@ -3,6 +3,8 @@ use rayon::iter::{IntoParallelIterator as _, ParallelIterator as _};
use super::error::{EmbedError, EmbedErrorKind, NewEmbedderError, NewEmbedderErrorKind};
use super::rest::{Embedder as RestEmbedder, EmbedderOptions as RestEmbedderOptions};
use super::{DistributionShift, Embeddings};
use crate::error::FaultSource;
use crate::ThreadPoolNoAbort;
#[derive(Debug)]
pub struct Embedder {
@ -71,11 +73,16 @@ impl Embedder {
pub fn embed_chunks(
&self,
text_chunks: Vec<Vec<String>>,
threads: &rayon::ThreadPool,
threads: &ThreadPoolNoAbort,
) -> Result<Vec<Vec<Embeddings<f32>>>, EmbedError> {
threads.install(move || {
text_chunks.into_par_iter().map(move |chunk| self.embed(chunk)).collect()
})
threads
.install(move || {
text_chunks.into_par_iter().map(move |chunk| self.embed(chunk)).collect()
})
.map_err(|error| EmbedError {
kind: EmbedErrorKind::PanicInThreadPool(error),
fault: FaultSource::Bug,
})?
}
pub fn chunk_count_hint(&self) -> usize {

View File

@ -4,7 +4,9 @@ use rayon::iter::{IntoParallelIterator, ParallelIterator as _};
use super::error::{EmbedError, NewEmbedderError};
use super::rest::{Embedder as RestEmbedder, EmbedderOptions as RestEmbedderOptions};
use super::{DistributionShift, Embeddings};
use crate::error::FaultSource;
use crate::vector::error::EmbedErrorKind;
use crate::ThreadPoolNoAbort;
#[derive(Debug, Clone, Hash, PartialEq, Eq, serde::Deserialize, serde::Serialize)]
pub struct EmbedderOptions {
@ -241,11 +243,16 @@ impl Embedder {
pub fn embed_chunks(
&self,
text_chunks: Vec<Vec<String>>,
threads: &rayon::ThreadPool,
threads: &ThreadPoolNoAbort,
) -> Result<Vec<Vec<Embeddings<f32>>>, EmbedError> {
threads.install(move || {
text_chunks.into_par_iter().map(move |chunk| self.embed(chunk)).collect()
})
threads
.install(move || {
text_chunks.into_par_iter().map(move |chunk| self.embed(chunk)).collect()
})
.map_err(|error| EmbedError {
kind: EmbedErrorKind::PanicInThreadPool(error),
fault: FaultSource::Bug,
})?
}
pub fn chunk_count_hint(&self) -> usize {

View File

@ -2,9 +2,12 @@ use deserr::Deserr;
use rayon::iter::{IntoParallelIterator as _, ParallelIterator as _};
use serde::{Deserialize, Serialize};
use super::error::EmbedErrorKind;
use super::{
DistributionShift, EmbedError, Embedding, Embeddings, NewEmbedderError, REQUEST_PARALLELISM,
};
use crate::error::FaultSource;
use crate::ThreadPoolNoAbort;
// retrying in case of failure
@ -158,11 +161,16 @@ impl Embedder {
pub fn embed_chunks(
&self,
text_chunks: Vec<Vec<String>>,
threads: &rayon::ThreadPool,
threads: &ThreadPoolNoAbort,
) -> Result<Vec<Vec<Embeddings<f32>>>, EmbedError> {
threads.install(move || {
text_chunks.into_par_iter().map(move |chunk| self.embed(chunk)).collect()
})
threads
.install(move || {
text_chunks.into_par_iter().map(move |chunk| self.embed(chunk)).collect()
})
.map_err(|error| EmbedError {
kind: EmbedErrorKind::PanicInThreadPool(error),
fault: FaultSource::Bug,
})?
}
pub fn chunk_count_hint(&self) -> usize {

View File

@ -301,10 +301,14 @@ impl From<EmbeddingConfig> for EmbeddingSettings {
fn from(value: EmbeddingConfig) -> Self {
let EmbeddingConfig { embedder_options, prompt } = value;
match embedder_options {
super::EmbedderOptions::HuggingFace(options) => Self {
super::EmbedderOptions::HuggingFace(super::hf::EmbedderOptions {
model,
revision,
distribution,
}) => Self {
source: Setting::Set(EmbedderSource::HuggingFace),
model: Setting::Set(options.model),
revision: options.revision.map(Setting::Set).unwrap_or_default(),
model: Setting::Set(model),
revision: revision.map(Setting::Set).unwrap_or_default(),
api_key: Setting::NotSet,
dimensions: Setting::NotSet,
document_template: Setting::Set(prompt.template),
@ -314,14 +318,19 @@ impl From<EmbeddingConfig> for EmbeddingSettings {
path_to_embeddings: Setting::NotSet,
embedding_object: Setting::NotSet,
input_type: Setting::NotSet,
distribution: options.distribution.map(Setting::Set).unwrap_or_default(),
distribution: distribution.map(Setting::Set).unwrap_or_default(),
},
super::EmbedderOptions::OpenAi(options) => Self {
super::EmbedderOptions::OpenAi(super::openai::EmbedderOptions {
api_key,
embedding_model,
dimensions,
distribution,
}) => Self {
source: Setting::Set(EmbedderSource::OpenAi),
model: Setting::Set(options.embedding_model.name().to_owned()),
model: Setting::Set(embedding_model.name().to_owned()),
revision: Setting::NotSet,
api_key: options.api_key.map(Setting::Set).unwrap_or_default(),
dimensions: options.dimensions.map(Setting::Set).unwrap_or_default(),
api_key: api_key.map(Setting::Set).unwrap_or_default(),
dimensions: dimensions.map(Setting::Set).unwrap_or_default(),
document_template: Setting::Set(prompt.template),
url: Setting::NotSet,
query: Setting::NotSet,
@ -329,29 +338,37 @@ impl From<EmbeddingConfig> for EmbeddingSettings {
path_to_embeddings: Setting::NotSet,
embedding_object: Setting::NotSet,
input_type: Setting::NotSet,
distribution: options.distribution.map(Setting::Set).unwrap_or_default(),
distribution: distribution.map(Setting::Set).unwrap_or_default(),
},
super::EmbedderOptions::Ollama(options) => Self {
super::EmbedderOptions::Ollama(super::ollama::EmbedderOptions {
embedding_model,
url,
api_key,
distribution,
}) => Self {
source: Setting::Set(EmbedderSource::Ollama),
model: Setting::Set(options.embedding_model.to_owned()),
model: Setting::Set(embedding_model),
revision: Setting::NotSet,
api_key: Setting::NotSet,
api_key: api_key.map(Setting::Set).unwrap_or_default(),
dimensions: Setting::NotSet,
document_template: Setting::Set(prompt.template),
url: Setting::NotSet,
url: url.map(Setting::Set).unwrap_or_default(),
query: Setting::NotSet,
input_field: Setting::NotSet,
path_to_embeddings: Setting::NotSet,
embedding_object: Setting::NotSet,
input_type: Setting::NotSet,
distribution: options.distribution.map(Setting::Set).unwrap_or_default(),
distribution: distribution.map(Setting::Set).unwrap_or_default(),
},
super::EmbedderOptions::UserProvided(options) => Self {
super::EmbedderOptions::UserProvided(super::manual::EmbedderOptions {
dimensions,
distribution,
}) => Self {
source: Setting::Set(EmbedderSource::UserProvided),
model: Setting::NotSet,
revision: Setting::NotSet,
api_key: Setting::NotSet,
dimensions: Setting::Set(options.dimensions),
dimensions: Setting::Set(dimensions),
document_template: Setting::NotSet,
url: Setting::NotSet,
query: Setting::NotSet,
@ -359,7 +376,7 @@ impl From<EmbeddingConfig> for EmbeddingSettings {
path_to_embeddings: Setting::NotSet,
embedding_object: Setting::NotSet,
input_type: Setting::NotSet,
distribution: options.distribution.map(Setting::Set).unwrap_or_default(),
distribution: distribution.map(Setting::Set).unwrap_or_default(),
},
super::EmbedderOptions::Rest(super::rest::EmbedderOptions {
api_key,

View File

@ -217,9 +217,7 @@ fn add_memory_samples(
memory_counters: &mut Option<MemoryCounterHandles>,
last_memory: &mut MemoryStats,
) -> Option<MemoryStats> {
let Some(stats) = memory else {
return None;
};
let stats = memory?;
let memory_counters =
memory_counters.get_or_insert_with(|| MemoryCounterHandles::new(profile, main));