Compare commits


26 Commits

Author SHA1 Message Date
b3952e8b3d Only spawn request threads if necessary 2024-06-18 15:34:39 +02:00
c668043c4f Merge #4617
4617: Destructure `EmbedderOptions` so we don't miss some options r=dureuill a=dureuill

# Pull Request

## Related issue
#4595 was caused by the code not destructuring the embedder options.


## What does this PR do?
This PR adds the missing `url` parameter for Ollama and makes sure a similar issue cannot happen in the future.
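
For context, the robustness technique here is exhaustive destructuring: binding every field of the struct by name, so that adding a new field later becomes a compile error at each destructuring site until the field is handled. A minimal sketch of the idea (the struct and values below are illustrative, not the real `EmbedderOptions`):

```rust
// Illustrative sketch of exhaustive destructuring, not the real milli type.
struct EmbedderOptions {
    model: String,
    url: Option<String>,
    api_key: Option<String>,
}

fn describe(options: &EmbedderOptions) -> String {
    // Every field is named: if EmbedderOptions later gains a field, this
    // pattern stops compiling until the new field is handled here too.
    let EmbedderOptions { model, url, api_key } = options;
    format!("model={model}, url={url:?}, api_key set: {}", api_key.is_some())
}

fn main() {
    let opts = EmbedderOptions {
        model: "nomic-embed-text".into(),            // placeholder model name
        url: Some("http://localhost:11434".into()),  // placeholder Ollama-style URL
        api_key: None,
    };
    println!("{}", describe(&opts));
}
```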



Co-authored-by: Louis Dureuil <louis@meilisearch.com>
2024-05-02 14:55:32 +00:00
5a305bfdea Remove unused struct 2024-05-02 16:14:37 +02:00
f4dd73ec8c Destructure EmbedderOptions so we don't miss some options 2024-05-02 15:39:36 +02:00
66dce4600d Merge #4603
4603: Update charabia v0.8.10 r=Kerollmops a=ManyTheFish

- Update Charabia v0.8.10
- Add `swedish-recomposition` as an optional feature flag

Co-authored-by: ManyTheFish <many@meilisearch.com>
2024-04-30 13:04:02 +00:00
fe51ceca6d Update lock file 2024-04-30 14:33:37 +02:00
88174b8ae4 Update charabia v0.8.10 2024-04-30 14:30:23 +02:00
ebca29f3de Merge #4597
4597: Fix embeddings settings update r=ManyTheFish a=ManyTheFish

# Pull Request
- add some conditions reducing the work done when changing the settings (see the sketch below)
- add some benchmarks on embedders
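
As a rough illustration of the kind of condition added, re-embedding can be skipped when a document's rendered prompt did not change between the old and new versions. A simplified, self-contained sketch; the types and `render_prompt` are stand-ins, not milli's real API (the actual check appears in the `extract_vector_points` diff further down):

```rust
// Simplified sketch of the "skip unchanged embeddings" idea; the real
// condition lives in extract_vector_points (see the diff further down).
struct Document {
    title: String,
    overview: String,
}

// Stand-in for milli's prompt rendering.
fn render_prompt(doc: &Document) -> String {
    format!("{} {}", doc.title, doc.overview)
}

fn needs_reembedding(old: Option<&Document>, new: &Document) -> bool {
    let old_prompt = old.map(render_prompt);
    let new_prompt = render_prompt(new);
    // Only pay for an embedder call when the rendered prompt changed.
    old_prompt.as_ref() != Some(&new_prompt)
}

fn main() {
    let before = Document { title: "Dune".into(), overview: "Spice.".into() };
    let after = Document { title: "Dune".into(), overview: "Spice.".into() };
    assert!(!needs_reembedding(Some(&before), &after)); // unchanged: skip
}
```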

## Related issue
Fixes #4585


Co-authored-by: ManyTheFish <many@meilisearch.com>
2024-04-25 16:37:28 +00:00
c793b6ef6d Merge #4600
4600: Fix embedders api r=ManyTheFish a=ManyTheFish

# Pull Request

## Related issue
Fixes #4594
Fixes #4595


Co-authored-by: ManyTheFish <many@meilisearch.com>
2024-04-25 13:16:33 +00:00
cbbfff3594 Remove debugging prints 2024-04-25 10:37:18 +02:00
dbcf50589b Fix clippy 2024-04-25 10:36:10 +02:00
3e5cd027a5 Merge #4593
4593: Stop crashing when panic occurs in thread pool r=ManyTheFish a=Kerollmops

This PR fixes #4362 by introducing a new boolean that catches panics in the rayon thread pool. The boolean is read after the rayon operations complete and, if a panic was caught, the indexation process is stopped. This first version doesn't expose the panic message but marks the task as failed.

The current implementation exposes a `ThreadPoolNoAbort` wrapper. The `rayon::ThreadPool` has been wrapped to check that nothing went wrong after running the `ThreadPool::install` function. An atomic boolean and some `store/load` logic make the system work efficiently.
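
A condensed sketch of that mechanism follows (the complete `ThreadPoolNoAbort` wrapper is in the diff below; this only shows the `panic_handler` + atomic-flag idea with placeholder work):

```rust
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::Arc;

fn main() -> Result<(), rayon::ThreadPoolBuildError> {
    let pool_caught_panic = Arc::new(AtomicBool::new(false));
    let flag = pool_caught_panic.clone();

    // Record panics from pool threads instead of letting rayon abort the process.
    let pool = rayon::ThreadPoolBuilder::new()
        .panic_handler(move |_payload| flag.store(true, Ordering::SeqCst))
        .build()?;

    let output = pool.install(|| {
        // ... parallel indexing work would go here ...
        2 + 2
    });

    // Read and reset the flag once the work is done; fail gracefully instead of crashing.
    if pool_caught_panic.swap(false, Ordering::SeqCst) {
        eprintln!("a pool thread panicked: marking the task as failed");
    } else {
        println!("indexing result: {output}");
    }
    Ok(())
}
```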

Before, Meilisearch was completely crashing...

<img width="1563" alt="Capture d’écran 2024-04-22 à 15 49 02" src="https://github.com/meilisearch/meilisearch/assets/3610253/ce114917-a881-4fbb-85df-c195fcf0c7cb">

Now, it handles the panics correctly and marks the task as failed.

<img width="1558" alt="Capture d’écran 2024-04-22 à 15 42 14" src="https://github.com/meilisearch/meilisearch/assets/3610253/8bd031ef-5e8f-4a12-a91e-c823597a2344">


Co-authored-by: Clément Renault <clement@meilisearch.com>
2024-04-24 16:27:08 +00:00
7468c1cf8d Introduce WildcardSetting that is serialized as wildcards by default 2024-04-24 18:15:03 +02:00
d4aeff92d0 Introduce the ThreadPoolNoAbort wrapper 2024-04-24 16:40:12 +02:00
e87cb373de Avoid intermediate serializing when displaying settings 2024-04-24 12:33:07 +02:00
9b76501875 Display set API key for Ollama embedder 2024-04-24 12:33:07 +02:00
6247e95dc3 Add benchmark for embeddings 2024-04-23 17:42:20 +02:00
b3173d0423 Remove useless dots in the error messages 2024-04-22 18:09:33 +02:00
96cc5319c8 Introduce a new internal error type to categorize panics 2024-04-22 18:09:33 +02:00
0c7003c5df Introduce an atomic to catch panics in thread pools 2024-04-22 18:09:33 +02:00
a1aa999026 Add conditions reducing work 2024-04-22 14:18:35 +02:00
aa0bbbb246 Merge #4578
4578: Remove useless analytics r=ManyTheFish a=irevoire

# Pull Request

## Related issue
Fixes #4577

## What does this PR do?
Remove the following analytics:
- `Health Seen`
- `Stats Seen`
- `Task Seen`
- `Version Seen`


Co-authored-by: Tamo <tamo@meilisearch.com>
2024-04-18 13:30:42 +00:00
2dd9dd6d0a remove the Health Seen analytic 2024-04-17 11:43:40 +02:00
e1f27de51a remove the Stats Seen analytic 2024-04-16 18:49:41 +02:00
abae31aee0 remove the Task Seen analytic 2024-04-16 18:48:10 +02:00
70ce0095ea remove the Version Seen analytic 2024-04-16 18:48:03 +02:00
33 changed files with 425 additions and 351 deletions

Cargo.lock (generated)
View File

@ -889,9 +889,9 @@ dependencies = [
[[package]]
name = "charabia"
version = "0.8.9"
version = "0.8.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f6a65052f308636e5d5e1777f0dbc07919f5fbac24b6c8ad3e140472e5520de9"
checksum = "933f20f2269b24d32fd5503e7b3c268af902190daf8d9d2b73ed2e75d77c00b4"
dependencies = [
"aho-corasick",
"cow-utils",

View File

@ -256,8 +256,8 @@ pub(crate) mod test {
pub fn create_test_settings() -> Settings<Checked> {
let settings = Settings {
displayed_attributes: Setting::Set(vec![S("race"), S("name")]),
searchable_attributes: Setting::Set(vec![S("name"), S("race")]),
displayed_attributes: Setting::Set(vec![S("race"), S("name")]).into(),
searchable_attributes: Setting::Set(vec![S("name"), S("race")]).into(),
filterable_attributes: Setting::Set(btreeset! { S("race"), S("age") }),
sortable_attributes: Setting::Set(btreeset! { S("age") }),
ranking_rules: Setting::NotSet,

View File

@ -315,8 +315,8 @@ impl From<v5::ResponseError> for v6::ResponseError {
impl<T> From<v5::Settings<T>> for v6::Settings<v6::Unchecked> {
fn from(settings: v5::Settings<T>) -> Self {
v6::Settings {
displayed_attributes: settings.displayed_attributes.into(),
searchable_attributes: settings.searchable_attributes.into(),
displayed_attributes: v6::Setting::from(settings.displayed_attributes).into(),
searchable_attributes: v6::Setting::from(settings.searchable_attributes).into(),
filterable_attributes: settings.filterable_attributes.into(),
sortable_attributes: settings.sortable_attributes.into(),
ranking_rules: {

View File

@ -6,7 +6,7 @@ source: index-scheduler/src/lib.rs
[]
----------------------------------------------------------------------
### All Tasks:
0 {uid: 0, status: enqueued, details: { settings: Settings { displayed_attributes: NotSet, searchable_attributes: NotSet, filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(4), document_template: NotSet, url: Set("http://localhost:7777"), query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData<meilisearch_types::settings::Unchecked> } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: NotSet, searchable_attributes: NotSet, filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(4), document_template: NotSet, url: Set("http://localhost:7777"), query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData<meilisearch_types::settings::Unchecked> }, is_deletion: false, allow_index_creation: true }}
0 {uid: 0, status: enqueued, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(4), document_template: NotSet, url: Set("http://localhost:7777"), query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData<meilisearch_types::settings::Unchecked> } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(4), document_template: NotSet, url: Set("http://localhost:7777"), query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData<meilisearch_types::settings::Unchecked> }, is_deletion: false, allow_index_creation: true }}
----------------------------------------------------------------------
### Status:
enqueued [0,]

View File

@ -6,7 +6,7 @@ source: index-scheduler/src/lib.rs
[]
----------------------------------------------------------------------
### All Tasks:
0 {uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: NotSet, searchable_attributes: NotSet, filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(4), document_template: NotSet, url: Set("http://localhost:7777"), query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData<meilisearch_types::settings::Unchecked> } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: NotSet, searchable_attributes: NotSet, filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(4), document_template: NotSet, url: Set("http://localhost:7777"), query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData<meilisearch_types::settings::Unchecked> }, is_deletion: false, allow_index_creation: true }}
0 {uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(4), document_template: NotSet, url: Set("http://localhost:7777"), query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData<meilisearch_types::settings::Unchecked> } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(4), document_template: NotSet, url: Set("http://localhost:7777"), query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData<meilisearch_types::settings::Unchecked> }, is_deletion: false, allow_index_creation: true }}
----------------------------------------------------------------------
### Status:
enqueued []

View File

@ -57,3 +57,5 @@ greek = ["milli/greek"]
khmer = ["milli/khmer"]
# allow vietnamese specialized tokenization
vietnamese = ["milli/vietnamese"]
# force swedish character recomposition
swedish-recomposition = ["milli/swedish-recomposition"]

View File

@ -3,7 +3,7 @@ use std::convert::Infallible;
use std::fmt;
use std::marker::PhantomData;
use std::num::NonZeroUsize;
use std::ops::ControlFlow;
use std::ops::{ControlFlow, Deref};
use std::str::FromStr;
use deserr::{DeserializeError, Deserr, ErrorKind, MergeWithError, ValuePointerRef};
@ -143,21 +143,13 @@ impl MergeWithError<milli::CriterionError> for DeserrJsonError<InvalidSettingsRa
)]
#[deserr(error = DeserrJsonError, rename_all = camelCase, deny_unknown_fields)]
pub struct Settings<T> {
#[serde(
default,
serialize_with = "serialize_with_wildcard",
skip_serializing_if = "Setting::is_not_set"
)]
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
#[deserr(default, error = DeserrJsonError<InvalidSettingsDisplayedAttributes>)]
pub displayed_attributes: Setting<Vec<String>>,
pub displayed_attributes: WildcardSetting,
#[serde(
default,
serialize_with = "serialize_with_wildcard",
skip_serializing_if = "Setting::is_not_set"
)]
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
#[deserr(default, error = DeserrJsonError<InvalidSettingsSearchableAttributes>)]
pub searchable_attributes: Setting<Vec<String>>,
pub searchable_attributes: WildcardSetting,
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
#[deserr(default, error = DeserrJsonError<InvalidSettingsFilterableAttributes>)]
@ -251,8 +243,8 @@ impl<T> Settings<T> {
impl Settings<Checked> {
pub fn cleared() -> Settings<Checked> {
Settings {
displayed_attributes: Setting::Reset,
searchable_attributes: Setting::Reset,
displayed_attributes: Setting::Reset.into(),
searchable_attributes: Setting::Reset.into(),
filterable_attributes: Setting::Reset,
sortable_attributes: Setting::Reset,
ranking_rules: Setting::Reset,
@ -319,7 +311,7 @@ impl Settings<Checked> {
impl Settings<Unchecked> {
pub fn check(self) -> Settings<Checked> {
let displayed_attributes = match self.displayed_attributes {
let displayed_attributes = match self.displayed_attributes.0 {
Setting::Set(fields) => {
if fields.iter().any(|f| f == "*") {
Setting::Reset
@ -330,7 +322,7 @@ impl Settings<Unchecked> {
otherwise => otherwise,
};
let searchable_attributes = match self.searchable_attributes {
let searchable_attributes = match self.searchable_attributes.0 {
Setting::Set(fields) => {
if fields.iter().any(|f| f == "*") {
Setting::Reset
@ -342,8 +334,8 @@ impl Settings<Unchecked> {
};
Settings {
displayed_attributes,
searchable_attributes,
displayed_attributes: displayed_attributes.into(),
searchable_attributes: searchable_attributes.into(),
filterable_attributes: self.filterable_attributes,
sortable_attributes: self.sortable_attributes,
ranking_rules: self.ranking_rules,
@ -412,13 +404,13 @@ pub fn apply_settings_to_builder(
_kind,
} = settings;
match searchable_attributes {
match searchable_attributes.deref() {
Setting::Set(ref names) => builder.set_searchable_fields(names.clone()),
Setting::Reset => builder.reset_searchable_fields(),
Setting::NotSet => (),
}
match displayed_attributes {
match displayed_attributes.deref() {
Setting::Set(ref names) => builder.set_displayed_fields(names.clone()),
Setting::Reset => builder.reset_displayed_fields(),
Setting::NotSet => (),
@ -690,11 +682,13 @@ pub fn settings(
displayed_attributes: match displayed_attributes {
Some(attrs) => Setting::Set(attrs),
None => Setting::Reset,
},
}
.into(),
searchable_attributes: match searchable_attributes {
Some(attrs) => Setting::Set(attrs),
None => Setting::Reset,
},
}
.into(),
filterable_attributes: Setting::Set(filterable_attributes),
sortable_attributes: Setting::Set(sortable_attributes),
ranking_rules: Setting::Set(criteria.iter().map(|c| c.clone().into()).collect()),
@ -848,6 +842,41 @@ impl From<ProximityPrecisionView> for ProximityPrecision {
}
}
#[derive(Debug, Clone, Default, Deserialize, PartialEq, Eq)]
pub struct WildcardSetting(Setting<Vec<String>>);
impl From<Setting<Vec<String>>> for WildcardSetting {
fn from(setting: Setting<Vec<String>>) -> Self {
Self(setting)
}
}
impl Serialize for WildcardSetting {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where
S: Serializer,
{
serialize_with_wildcard(&self.0, serializer)
}
}
impl<E: deserr::DeserializeError> Deserr<E> for WildcardSetting {
fn deserialize_from_value<V: deserr::IntoValue>(
value: deserr::Value<V>,
location: ValuePointerRef<'_>,
) -> Result<Self, E> {
Ok(Self(Setting::deserialize_from_value(value, location)?))
}
}
impl std::ops::Deref for WildcardSetting {
type Target = Setting<Vec<String>>;
fn deref(&self) -> &Self::Target {
&self.0
}
}
#[cfg(test)]
pub(crate) mod test {
use super::*;
@ -856,8 +885,8 @@ pub(crate) mod test {
fn test_setting_check() {
// test no changes
let settings = Settings {
displayed_attributes: Setting::Set(vec![String::from("hello")]),
searchable_attributes: Setting::Set(vec![String::from("hello")]),
displayed_attributes: Setting::Set(vec![String::from("hello")]).into(),
searchable_attributes: Setting::Set(vec![String::from("hello")]).into(),
filterable_attributes: Setting::NotSet,
sortable_attributes: Setting::NotSet,
ranking_rules: Setting::NotSet,
@ -883,8 +912,9 @@ pub(crate) mod test {
// test wildcard
// test no changes
let settings = Settings {
displayed_attributes: Setting::Set(vec![String::from("*")]),
searchable_attributes: Setting::Set(vec![String::from("hello"), String::from("*")]),
displayed_attributes: Setting::Set(vec![String::from("*")]).into(),
searchable_attributes: Setting::Set(vec![String::from("hello"), String::from("*")])
.into(),
filterable_attributes: Setting::NotSet,
sortable_attributes: Setting::NotSet,
ranking_rules: Setting::NotSet,
@ -904,7 +934,7 @@ pub(crate) mod test {
};
let checked = settings.check();
assert_eq!(checked.displayed_attributes, Setting::Reset);
assert_eq!(checked.searchable_attributes, Setting::Reset);
assert_eq!(checked.displayed_attributes, Setting::Reset.into());
assert_eq!(checked.searchable_attributes, Setting::Reset.into());
}
}
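
For illustration, the `WildcardSetting` newtype introduced above centralizes what the removed per-field `serialize_with = "serialize_with_wildcard"` attributes did. A self-contained sketch with a simplified `Setting`; the `Reset` → `["*"]` mapping is an assumption based on the "serialized as wildcards by default" commit message:

```rust
use serde::{Serialize, Serializer};

// Simplified stand-ins for milli's Setting and WildcardSetting, to show the
// newtype-serializer pattern only.
enum Setting<T> {
    Set(T),
    Reset,
    NotSet,
}

struct WildcardSetting(Setting<Vec<String>>);

impl Serialize for WildcardSetting {
    fn serialize<S: Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
        match &self.0 {
            Setting::Set(fields) => Some(fields.clone()),
            // Assumed mapping: Reset means "show everything", so serialize ["*"].
            Setting::Reset => Some(vec!["*".to_string()]),
            // In the real code NotSet is skipped via skip_serializing_if;
            // rendered as null here for simplicity.
            Setting::NotSet => None,
        }
        .serialize(serializer)
    }
}

fn main() {
    let explicit = WildcardSetting(Setting::Set(vec!["title".to_string()]));
    let cleared = WildcardSetting(Setting::Reset);
    println!("{}", serde_json::to_string(&explicit).unwrap()); // ["title"]
    println!("{}", serde_json::to_string(&cleared).unwrap()); // ["*"]
}
```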

View File

@ -156,6 +156,7 @@ thai = ["meilisearch-types/thai"]
greek = ["meilisearch-types/greek"]
khmer = ["meilisearch-types/khmer"]
vietnamese = ["meilisearch-types/vietnamese"]
swedish-recomposition = ["meilisearch-types/swedish-recomposition"]
[package.metadata.mini-dashboard]
assets-url = "https://github.com/meilisearch/mini-dashboard/releases/download/v0.2.13/build.zip"

View File

@ -7,7 +7,6 @@ use serde_json::Value;
use super::{find_user_id, Analytics, DocumentDeletionKind, DocumentFetchKind};
use crate::routes::indexes::documents::UpdateDocumentsQuery;
use crate::routes::tasks::TasksFilterQuery;
use crate::Opt;
pub struct MockAnalytics {
@ -86,6 +85,4 @@ impl Analytics for MockAnalytics {
}
fn get_fetch_documents(&self, _documents_query: &DocumentFetchKind, _request: &HttpRequest) {}
fn post_fetch_documents(&self, _documents_query: &DocumentFetchKind, _request: &HttpRequest) {}
fn get_tasks(&self, _query: &TasksFilterQuery, _request: &HttpRequest) {}
fn health_seen(&self, _request: &HttpRequest) {}
}

View File

@ -14,7 +14,6 @@ use platform_dirs::AppDirs;
use serde_json::Value;
use crate::routes::indexes::documents::UpdateDocumentsQuery;
use crate::routes::tasks::TasksFilterQuery;
// if the analytics feature is disabled
// the `SegmentAnalytics` point to the mock instead of the real analytics
@ -117,10 +116,4 @@ pub trait Analytics: Sync + Send {
index_creation: bool,
request: &HttpRequest,
);
// this method should be called to aggregate the get tasks requests.
fn get_tasks(&self, query: &TasksFilterQuery, request: &HttpRequest);
// this method should be called to aggregate a add documents request
fn health_seen(&self, request: &HttpRequest);
}

View File

@ -33,7 +33,6 @@ use crate::option::{
};
use crate::routes::indexes::documents::UpdateDocumentsQuery;
use crate::routes::indexes::facet_search::FacetSearchQuery;
use crate::routes::tasks::TasksFilterQuery;
use crate::routes::{create_all_stats, Stats};
use crate::search::{
FacetSearchResult, MatchingStrategy, SearchQuery, SearchQueryWithIndex, SearchResult,
@ -81,8 +80,6 @@ pub enum AnalyticsMsg {
AggregateUpdateDocuments(DocumentsAggregator),
AggregateGetFetchDocuments(DocumentsFetchAggregator),
AggregatePostFetchDocuments(DocumentsFetchAggregator),
AggregateTasks(TasksAggregator),
AggregateHealth(HealthAggregator),
}
pub struct SegmentAnalytics {
@ -152,8 +149,6 @@ impl SegmentAnalytics {
update_documents_aggregator: DocumentsAggregator::default(),
get_fetch_documents_aggregator: DocumentsFetchAggregator::default(),
post_fetch_documents_aggregator: DocumentsFetchAggregator::default(),
get_tasks_aggregator: TasksAggregator::default(),
health_aggregator: HealthAggregator::default(),
});
tokio::spawn(segment.run(index_scheduler.clone(), auth_controller.clone()));
@ -231,16 +226,6 @@ impl super::Analytics for SegmentAnalytics {
let aggregate = DocumentsFetchAggregator::from_query(documents_query, request);
let _ = self.sender.try_send(AnalyticsMsg::AggregatePostFetchDocuments(aggregate));
}
fn get_tasks(&self, query: &TasksFilterQuery, request: &HttpRequest) {
let aggregate = TasksAggregator::from_query(query, request);
let _ = self.sender.try_send(AnalyticsMsg::AggregateTasks(aggregate));
}
fn health_seen(&self, request: &HttpRequest) {
let aggregate = HealthAggregator::from_query(request);
let _ = self.sender.try_send(AnalyticsMsg::AggregateHealth(aggregate));
}
}
/// This structure represent the `infos` field we send in the analytics.
@ -394,8 +379,6 @@ pub struct Segment {
update_documents_aggregator: DocumentsAggregator,
get_fetch_documents_aggregator: DocumentsFetchAggregator,
post_fetch_documents_aggregator: DocumentsFetchAggregator,
get_tasks_aggregator: TasksAggregator,
health_aggregator: HealthAggregator,
}
impl Segment {
@ -458,8 +441,6 @@ impl Segment {
Some(AnalyticsMsg::AggregateUpdateDocuments(agreg)) => self.update_documents_aggregator.aggregate(agreg),
Some(AnalyticsMsg::AggregateGetFetchDocuments(agreg)) => self.get_fetch_documents_aggregator.aggregate(agreg),
Some(AnalyticsMsg::AggregatePostFetchDocuments(agreg)) => self.post_fetch_documents_aggregator.aggregate(agreg),
Some(AnalyticsMsg::AggregateTasks(agreg)) => self.get_tasks_aggregator.aggregate(agreg),
Some(AnalyticsMsg::AggregateHealth(agreg)) => self.health_aggregator.aggregate(agreg),
None => (),
}
}
@ -513,8 +494,6 @@ impl Segment {
update_documents_aggregator,
get_fetch_documents_aggregator,
post_fetch_documents_aggregator,
get_tasks_aggregator,
health_aggregator,
} = self;
if let Some(get_search) =
@ -562,12 +541,6 @@ impl Segment {
{
let _ = self.batcher.push(post_fetch_documents).await;
}
if let Some(get_tasks) = take(get_tasks_aggregator).into_event(user, "Tasks Seen") {
let _ = self.batcher.push(get_tasks).await;
}
if let Some(health) = take(health_aggregator).into_event(user, "Health Seen") {
let _ = self.batcher.push(health).await;
}
let _ = self.batcher.flush().await;
}
}
@ -1503,176 +1476,6 @@ impl DocumentsDeletionAggregator {
}
}
#[derive(Default, Serialize)]
pub struct TasksAggregator {
#[serde(skip)]
timestamp: Option<OffsetDateTime>,
// context
#[serde(rename = "user-agent")]
user_agents: HashSet<String>,
filtered_by_uid: bool,
filtered_by_index_uid: bool,
filtered_by_type: bool,
filtered_by_status: bool,
filtered_by_canceled_by: bool,
filtered_by_before_enqueued_at: bool,
filtered_by_after_enqueued_at: bool,
filtered_by_before_started_at: bool,
filtered_by_after_started_at: bool,
filtered_by_before_finished_at: bool,
filtered_by_after_finished_at: bool,
total_received: usize,
}
impl TasksAggregator {
pub fn from_query(query: &TasksFilterQuery, request: &HttpRequest) -> Self {
let TasksFilterQuery {
limit: _,
from: _,
uids,
index_uids,
types,
statuses,
canceled_by,
before_enqueued_at,
after_enqueued_at,
before_started_at,
after_started_at,
before_finished_at,
after_finished_at,
} = query;
Self {
timestamp: Some(OffsetDateTime::now_utc()),
user_agents: extract_user_agents(request).into_iter().collect(),
filtered_by_uid: uids.is_some(),
filtered_by_index_uid: index_uids.is_some(),
filtered_by_type: types.is_some(),
filtered_by_status: statuses.is_some(),
filtered_by_canceled_by: canceled_by.is_some(),
filtered_by_before_enqueued_at: before_enqueued_at.is_some(),
filtered_by_after_enqueued_at: after_enqueued_at.is_some(),
filtered_by_before_started_at: before_started_at.is_some(),
filtered_by_after_started_at: after_started_at.is_some(),
filtered_by_before_finished_at: before_finished_at.is_some(),
filtered_by_after_finished_at: after_finished_at.is_some(),
total_received: 1,
}
}
/// Aggregate one [TasksAggregator] into another.
pub fn aggregate(&mut self, other: Self) {
let Self {
timestamp,
user_agents,
total_received,
filtered_by_uid,
filtered_by_index_uid,
filtered_by_type,
filtered_by_status,
filtered_by_canceled_by,
filtered_by_before_enqueued_at,
filtered_by_after_enqueued_at,
filtered_by_before_started_at,
filtered_by_after_started_at,
filtered_by_before_finished_at,
filtered_by_after_finished_at,
} = other;
if self.timestamp.is_none() {
self.timestamp = timestamp;
}
// we can't create a union because there is no `into_union` method
for user_agent in user_agents {
self.user_agents.insert(user_agent);
}
self.filtered_by_uid |= filtered_by_uid;
self.filtered_by_index_uid |= filtered_by_index_uid;
self.filtered_by_type |= filtered_by_type;
self.filtered_by_status |= filtered_by_status;
self.filtered_by_canceled_by |= filtered_by_canceled_by;
self.filtered_by_before_enqueued_at |= filtered_by_before_enqueued_at;
self.filtered_by_after_enqueued_at |= filtered_by_after_enqueued_at;
self.filtered_by_before_started_at |= filtered_by_before_started_at;
self.filtered_by_after_started_at |= filtered_by_after_started_at;
self.filtered_by_before_finished_at |= filtered_by_before_finished_at;
self.filtered_by_after_finished_at |= filtered_by_after_finished_at;
self.filtered_by_after_finished_at |= filtered_by_after_finished_at;
self.total_received = self.total_received.saturating_add(total_received);
}
pub fn into_event(self, user: &User, event_name: &str) -> Option<Track> {
// if we had no timestamp it means we never encountered any events and
// thus we don't need to send this event.
let timestamp = self.timestamp?;
Some(Track {
timestamp: Some(timestamp),
user: user.clone(),
event: event_name.to_string(),
properties: serde_json::to_value(self).ok()?,
..Default::default()
})
}
}
#[derive(Default, Serialize)]
pub struct HealthAggregator {
#[serde(skip)]
timestamp: Option<OffsetDateTime>,
// context
#[serde(rename = "user-agent")]
user_agents: HashSet<String>,
#[serde(rename = "requests.total_received")]
total_received: usize,
}
impl HealthAggregator {
pub fn from_query(request: &HttpRequest) -> Self {
Self {
timestamp: Some(OffsetDateTime::now_utc()),
user_agents: extract_user_agents(request).into_iter().collect(),
total_received: 1,
}
}
/// Aggregate one [HealthAggregator] into another.
pub fn aggregate(&mut self, other: Self) {
let Self { timestamp, user_agents, total_received } = other;
if self.timestamp.is_none() {
self.timestamp = timestamp;
}
// we can't create a union because there is no `into_union` method
for user_agent in user_agents {
self.user_agents.insert(user_agent);
}
self.total_received = self.total_received.saturating_add(total_received);
}
pub fn into_event(self, user: &User, event_name: &str) -> Option<Track> {
// if we had no timestamp it means we never encountered any events and
// thus we don't need to send this event.
let timestamp = self.timestamp?;
Some(Track {
timestamp: Some(timestamp),
user: user.clone(),
event: event_name.to_string(),
properties: serde_json::to_value(self).ok()?,
..Default::default()
})
}
}
#[derive(Default, Serialize)]
pub struct DocumentsFetchAggregator {
#[serde(skip)]

View File

@ -13,6 +13,7 @@ use byte_unit::{Byte, ByteError};
use clap::Parser;
use meilisearch_types::features::InstanceTogglableFeatures;
use meilisearch_types::milli::update::IndexerConfig;
use meilisearch_types::milli::ThreadPoolNoAbortBuilder;
use rustls::server::{
AllowAnyAnonymousOrAuthenticatedClient, AllowAnyAuthenticatedClient, ServerSessionMemoryCache,
};
@ -666,7 +667,7 @@ impl TryFrom<&IndexerOpts> for IndexerConfig {
type Error = anyhow::Error;
fn try_from(other: &IndexerOpts) -> Result<Self, Self::Error> {
let thread_pool = rayon::ThreadPoolBuilder::new()
let thread_pool = ThreadPoolNoAbortBuilder::new()
.thread_name(|index| format!("indexing-thread:{index}"))
.num_threads(*other.max_indexing_threads)
.build()?;

View File

@ -269,12 +269,8 @@ impl From<index_scheduler::IndexStats> for IndexStats {
pub async fn get_index_stats(
index_scheduler: GuardedData<ActionPolicy<{ actions::STATS_GET }>, Data<IndexScheduler>>,
index_uid: web::Path<String>,
req: HttpRequest,
analytics: web::Data<dyn Analytics>,
) -> Result<HttpResponse, ResponseError> {
let index_uid = IndexUid::try_from(index_uid.into_inner())?;
analytics.publish("Stats Seen".to_string(), json!({ "per_index_uid": true }), Some(&req));
let stats = IndexStats::from(index_scheduler.index_stats(&index_uid)?);
debug!(returns = ?stats, "Get index stats");

View File

@ -137,10 +137,8 @@ macro_rules! make_setting_route {
let settings = settings(&index, &rtxn, meilisearch_types::settings::SecretPolicy::HideSecrets)?;
debug!(returns = ?settings, "Update settings");
let mut json = serde_json::json!(&settings);
let val = json[$camelcase_attr].take();
Ok(HttpResponse::Ok().json(val))
Ok(HttpResponse::Ok().json(settings.$attr))
}
pub fn resources() -> Resource {

View File

@ -8,11 +8,9 @@ use meilisearch_types::error::{Code, ResponseError};
use meilisearch_types::settings::{Settings, Unchecked};
use meilisearch_types::tasks::{Kind, Status, Task, TaskId};
use serde::{Deserialize, Serialize};
use serde_json::json;
use time::OffsetDateTime;
use tracing::debug;
use crate::analytics::Analytics;
use crate::extractors::authentication::policies::*;
use crate::extractors::authentication::GuardedData;
use crate::search_queue::SearchQueue;
@ -296,10 +294,7 @@ pub struct Stats {
async fn get_stats(
index_scheduler: GuardedData<ActionPolicy<{ actions::STATS_GET }>, Data<IndexScheduler>>,
auth_controller: GuardedData<ActionPolicy<{ actions::STATS_GET }>, Data<AuthController>>,
req: HttpRequest,
analytics: web::Data<dyn Analytics>,
) -> Result<HttpResponse, ResponseError> {
analytics.publish("Stats Seen".to_string(), json!({ "per_index_uid": false }), Some(&req));
let filters = index_scheduler.filters();
let stats = create_all_stats((*index_scheduler).clone(), (*auth_controller).clone(), filters)?;
@ -355,11 +350,7 @@ struct VersionResponse {
async fn get_version(
_index_scheduler: GuardedData<ActionPolicy<{ actions::VERSION }>, Data<IndexScheduler>>,
req: HttpRequest,
analytics: web::Data<dyn Analytics>,
) -> HttpResponse {
analytics.publish("Version Seen".to_string(), json!(null), Some(&req));
let build_info = build_info::BuildInfo::from_build();
HttpResponse::Ok().json(VersionResponse {
@ -376,21 +367,11 @@ async fn get_version(
})
}
#[derive(Serialize)]
struct KeysResponse {
private: Option<String>,
public: Option<String>,
}
pub async fn get_health(
req: HttpRequest,
index_scheduler: Data<IndexScheduler>,
auth_controller: Data<AuthController>,
search_queue: Data<SearchQueue>,
analytics: web::Data<dyn Analytics>,
) -> Result<HttpResponse, ResponseError> {
analytics.health_seen(&req);
search_queue.health().unwrap();
index_scheduler.health().unwrap();
auth_controller.health().unwrap();

View File

@ -270,12 +270,8 @@ pub struct AllTasks {
async fn get_tasks(
index_scheduler: GuardedData<ActionPolicy<{ actions::TASKS_GET }>, Data<IndexScheduler>>,
params: AwebQueryParameter<TasksFilterQuery, DeserrQueryParamError>,
req: HttpRequest,
analytics: web::Data<dyn Analytics>,
) -> Result<HttpResponse, ResponseError> {
let mut params = params.into_inner();
analytics.get_tasks(&params, &req);
// We +1 just to know if there is more after this "page" or not.
params.limit.0 = params.limit.0.saturating_add(1);
let limit = params.limit.0;
@ -298,8 +294,6 @@ async fn get_tasks(
async fn get_task(
index_scheduler: GuardedData<ActionPolicy<{ actions::TASKS_GET }>, Data<IndexScheduler>>,
task_uid: web::Path<String>,
req: HttpRequest,
analytics: web::Data<dyn Analytics>,
) -> Result<HttpResponse, ResponseError> {
let task_uid_string = task_uid.into_inner();
@ -310,8 +304,6 @@ async fn get_task(
}
};
analytics.publish("Tasks Seen".to_string(), json!({ "per_task_uid": true }), Some(&req));
let query = index_scheduler::Query { uids: Some(vec![task_uid]), ..Query::default() };
let filters = index_scheduler.filters();
let (tasks, _) = index_scheduler.get_tasks_from_authorized_indexes(query, filters)?;

View File

@ -17,7 +17,7 @@ bincode = "1.3.3"
bstr = "1.9.0"
bytemuck = { version = "1.14.0", features = ["extern_crate_alloc"] }
byteorder = "1.5.0"
charabia = { version = "0.8.9", default-features = false }
charabia = { version = "0.8.10", default-features = false }
concat-arrays = "0.1.2"
crossbeam-channel = "0.5.11"
deserr = "0.6.1"
@ -136,7 +136,11 @@ greek = ["charabia/greek"]
# allow khmer specialized tokenization
khmer = ["charabia/khmer"]
# allow vietnamese specialized tokenization
vietnamese = ["charabia/vietnamese"]
# force swedish character recomposition
swedish-recomposition = ["charabia/swedish-recomposition"]
# allow CUDA support, see <https://github.com/meilisearch/meilisearch/issues/4306>
cuda = ["candle-core/cuda"]

View File

@ -9,6 +9,7 @@ use serde_json::Value;
use thiserror::Error;
use crate::documents::{self, DocumentsBatchCursorError};
use crate::thread_pool_no_abort::PanicCatched;
use crate::{CriterionError, DocumentId, FieldId, Object, SortError};
pub fn is_reserved_keyword(keyword: &str) -> bool {
@ -39,17 +40,19 @@ pub enum InternalError {
Fst(#[from] fst::Error),
#[error(transparent)]
DocumentsError(#[from] documents::Error),
#[error("Invalid compression type have been specified to grenad.")]
#[error("Invalid compression type have been specified to grenad")]
GrenadInvalidCompressionType,
#[error("Invalid grenad file with an invalid version format.")]
#[error("Invalid grenad file with an invalid version format")]
GrenadInvalidFormatVersion,
#[error("Invalid merge while processing {process}.")]
#[error("Invalid merge while processing {process}")]
IndexingMergingKeys { process: &'static str },
#[error("{}", HeedError::InvalidDatabaseTyping)]
InvalidDatabaseTyping,
#[error(transparent)]
RayonThreadPool(#[from] ThreadPoolBuildError),
#[error(transparent)]
PanicInThreadPool(#[from] PanicCatched),
#[error(transparent)]
SerdeJson(#[from] serde_json::Error),
#[error(transparent)]
Serialization(#[from] SerializationError),
@ -57,9 +60,9 @@ pub enum InternalError {
Store(#[from] MdbError),
#[error(transparent)]
Utf8(#[from] str::Utf8Error),
#[error("An indexation process was explicitly aborted.")]
#[error("An indexation process was explicitly aborted")]
AbortedIndexation,
#[error("The matching words list contains at least one invalid member.")]
#[error("The matching words list contains at least one invalid member")]
InvalidMatchingWords,
#[error(transparent)]
ArroyError(#[from] arroy::Error),

View File

@ -21,6 +21,7 @@ pub mod prompt;
pub mod proximity;
pub mod score_details;
mod search;
mod thread_pool_no_abort;
pub mod update;
pub mod vector;
@ -42,6 +43,7 @@ pub use search::new::{
SearchLogger, VisualSearchLogger,
};
use serde_json::Value;
pub use thread_pool_no_abort::{PanicCatched, ThreadPoolNoAbort, ThreadPoolNoAbortBuilder};
pub use {charabia as tokenizer, heed};
pub use self::asc_desc::{AscDesc, AscDescError, Member, SortError};

View File

@ -0,0 +1,69 @@
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::Arc;
use rayon::{ThreadPool, ThreadPoolBuilder};
use thiserror::Error;
/// A rayon ThreadPool wrapper that can catch panics in the pool
/// and modifies the install function accordingly.
#[derive(Debug)]
pub struct ThreadPoolNoAbort {
thread_pool: ThreadPool,
/// Set to true if the thread pool catched a panic.
pool_catched_panic: Arc<AtomicBool>,
}
impl ThreadPoolNoAbort {
pub fn install<OP, R>(&self, op: OP) -> Result<R, PanicCatched>
where
OP: FnOnce() -> R + Send,
R: Send,
{
let output = self.thread_pool.install(op);
// While reseting the pool panic catcher we return an error if we catched one.
if self.pool_catched_panic.swap(false, Ordering::SeqCst) {
Err(PanicCatched)
} else {
Ok(output)
}
}
pub fn current_num_threads(&self) -> usize {
self.thread_pool.current_num_threads()
}
}
#[derive(Error, Debug)]
#[error("A panic occured. Read the logs to find more information about it")]
pub struct PanicCatched;
#[derive(Default)]
pub struct ThreadPoolNoAbortBuilder(ThreadPoolBuilder);
impl ThreadPoolNoAbortBuilder {
pub fn new() -> ThreadPoolNoAbortBuilder {
ThreadPoolNoAbortBuilder::default()
}
pub fn thread_name<F>(mut self, closure: F) -> Self
where
F: FnMut(usize) -> String + 'static,
{
self.0 = self.0.thread_name(closure);
self
}
pub fn num_threads(mut self, num_threads: usize) -> ThreadPoolNoAbortBuilder {
self.0 = self.0.num_threads(num_threads);
self
}
pub fn build(mut self) -> Result<ThreadPoolNoAbort, rayon::ThreadPoolBuildError> {
let pool_catched_panic = Arc::new(AtomicBool::new(false));
self.0 = self.0.panic_handler({
let catched_panic = pool_catched_panic.clone();
move |_result| catched_panic.store(true, Ordering::SeqCst)
});
Ok(ThreadPoolNoAbort { thread_pool: self.0.build()?, pool_catched_panic })
}
}
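
For illustration, calling code might use this wrapper as follows (a hedged sketch building on the types above; `anyhow` and the placeholder workload are assumptions, though the builder chain mirrors the `IndexerOpts` conversion earlier in this diff):

```rust
// Sketch: using the ThreadPoolNoAbort wrapper defined above.
fn run_indexing_work() -> anyhow::Result<usize> {
    let pool = ThreadPoolNoAbortBuilder::new()
        .thread_name(|index| format!("indexing-thread:{index}"))
        .num_threads(4)
        .build()?;

    // A panic recorded by the pool's panic handler surfaces here as
    // Err(PanicCatched) instead of aborting the whole process.
    let documents_indexed = pool.install(|| {
        // ... parallel indexing work ...
        1000_usize
    })?;

    Ok(documents_indexed)
}
```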

View File

@ -19,7 +19,7 @@ use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
use crate::update::index_documents::helpers::try_split_at;
use crate::update::settings::InnerIndexSettingsDiff;
use crate::vector::Embedder;
use crate::{DocumentId, InternalError, Result, VectorOrArrayOfVectors};
use crate::{DocumentId, InternalError, Result, ThreadPoolNoAbort, VectorOrArrayOfVectors};
/// The length of the elements that are always in the buffer when inserting new values.
const TRUNCATE_SIZE: usize = size_of::<DocumentId>();
@ -198,11 +198,16 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
if document_is_kept {
// Don't give up if the old prompt was failing
let old_prompt = prompt
.render(obkv, DelAdd::Deletion, old_fields_ids_map)
.unwrap_or_default();
let old_prompt = Some(prompt)
// TODO: this filter works because we erase the vec database when a embedding setting changes.
// When vector pipeline will be optimized, this should be removed.
.filter(|_| !settings_diff.reindex_vectors())
.map(|p| {
p.render(obkv, DelAdd::Deletion, old_fields_ids_map).unwrap_or_default()
});
let new_prompt = prompt.render(obkv, DelAdd::Addition, new_fields_ids_map)?;
if old_prompt != new_prompt {
if old_prompt.as_ref() != Some(&new_prompt) {
let old_prompt = old_prompt.unwrap_or_default();
tracing::trace!(
"🚀 Changing prompt from\n{old_prompt}\n===to===\n{new_prompt}"
);
@ -224,6 +229,7 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
&mut manual_vectors_writer,
&mut key_buffer,
delta,
settings_diff,
)?;
}
@ -264,10 +270,15 @@ fn push_vectors_diff(
manual_vectors_writer: &mut Writer<BufWriter<File>>,
key_buffer: &mut Vec<u8>,
delta: VectorStateDelta,
settings_diff: &InnerIndexSettingsDiff,
) -> Result<()> {
puffin::profile_function!();
let (must_remove, prompt, (mut del_vectors, mut add_vectors)) = delta.into_values();
if must_remove {
if must_remove
// TODO: the below condition works because we erase the vec database when a embedding setting changes.
// When vector pipeline will be optimized, this should be removed.
&& !settings_diff.reindex_vectors()
{
key_buffer.truncate(TRUNCATE_SIZE);
remove_vectors_writer.insert(&key_buffer, [])?;
}
@ -295,12 +306,16 @@ fn push_vectors_diff(
match eob {
EitherOrBoth::Both(_, _) => (), // no need to touch anything
EitherOrBoth::Left(vector) => {
// We insert only the Del part of the Obkv to inform
// that we only want to remove all those vectors.
let mut obkv = KvWriterDelAdd::memory();
obkv.insert(DelAdd::Deletion, cast_slice(&vector))?;
let bytes = obkv.into_inner()?;
manual_vectors_writer.insert(&key_buffer, bytes)?;
// TODO: the below condition works because we erase the vec database when a embedding setting changes.
// When vector pipeline will be optimized, this should be removed.
if !settings_diff.reindex_vectors() {
// We insert only the Del part of the Obkv to inform
// that we only want to remove all those vectors.
let mut obkv = KvWriterDelAdd::memory();
obkv.insert(DelAdd::Deletion, cast_slice(&vector))?;
let bytes = obkv.into_inner()?;
manual_vectors_writer.insert(&key_buffer, bytes)?;
}
}
EitherOrBoth::Right(vector) => {
// We insert only the Add part of the Obkv to inform
@ -347,7 +362,7 @@ pub fn extract_embeddings<R: io::Read + io::Seek>(
prompt_reader: grenad::Reader<R>,
indexer: GrenadParameters,
embedder: Arc<Embedder>,
request_threads: &rayon::ThreadPool,
request_threads: &ThreadPoolNoAbort,
) -> Result<grenad::Reader<BufReader<File>>> {
puffin::profile_function!();
let n_chunks = embedder.chunk_count_hint(); // chunk level parallelism

View File

@ -31,7 +31,7 @@ use self::extract_word_position_docids::extract_word_position_docids;
use super::helpers::{as_cloneable_grenad, CursorClonableMmap, GrenadParameters};
use super::{helpers, TypedChunk};
use crate::update::settings::InnerIndexSettingsDiff;
use crate::{FieldId, Result};
use crate::{FieldId, Result, ThreadPoolNoAbortBuilder};
/// Extract data for each databases from obkv documents in parallel.
/// Send data in grenad file over provided Sender.
@ -229,12 +229,15 @@ fn send_original_documents_data(
let documents_chunk_cloned = original_documents_chunk.clone();
let lmdb_writer_sx_cloned = lmdb_writer_sx.clone();
let request_threads = rayon::ThreadPoolBuilder::new()
.num_threads(crate::vector::REQUEST_PARALLELISM)
.thread_name(|index| format!("embedding-request-{index}"))
.build()?;
let new_embedding_configs = settings_diff.new.embedding_configs.clone();
if settings_diff.reindex_vectors() || !settings_diff.settings_update_only() {
if (settings_diff.reindex_vectors() || !settings_diff.settings_update_only())
&& new_embedding_configs.get_default().is_some()
{
let request_threads = ThreadPoolNoAbortBuilder::new()
.num_threads(crate::vector::REQUEST_PARALLELISM)
.thread_name(|index| format!("embedding-request-{index}"))
.build()?;
let settings_diff = settings_diff.clone();
rayon::spawn(move || {
for (name, (embedder, prompt)) in settings_diff.new.embedding_configs.clone() {

View File

@ -33,6 +33,7 @@ use self::helpers::{grenad_obkv_into_chunks, GrenadParameters};
pub use self::transform::{Transform, TransformOutput};
use crate::documents::{obkv_to_object, DocumentsBatchReader};
use crate::error::{Error, InternalError, UserError};
use crate::thread_pool_no_abort::ThreadPoolNoAbortBuilder;
pub use crate::update::index_documents::helpers::CursorClonableMmap;
use crate::update::{
IndexerConfig, UpdateIndexingStep, WordPrefixDocids, WordPrefixIntegerDocids, WordsPrefixesFst,
@ -298,18 +299,18 @@ where
let backup_pool;
let pool = match self.indexer_config.thread_pool {
Some(ref pool) => pool,
#[cfg(not(test))]
None => {
// We initialize a bakcup pool with the default
// We initialize a backup pool with the default
// settings if none have already been set.
backup_pool = rayon::ThreadPoolBuilder::new().build()?;
&backup_pool
}
#[cfg(test)]
None => {
// We initialize a bakcup pool with the default
// settings if none have already been set.
backup_pool = rayon::ThreadPoolBuilder::new().num_threads(1).build()?;
#[allow(unused_mut)]
let mut pool_builder = ThreadPoolNoAbortBuilder::new();
#[cfg(test)]
{
pool_builder = pool_builder.num_threads(1);
}
backup_pool = pool_builder.build()?;
&backup_pool
}
};
@ -533,7 +534,7 @@ where
}
Ok(())
})?;
}).map_err(InternalError::from)??;
// We write the field distribution into the main database
self.index.put_field_distribution(self.wtxn, &field_distribution)?;
@ -562,7 +563,8 @@ where
writer.build(wtxn, &mut rng, None)?;
}
Result::Ok(())
})?;
})
.map_err(InternalError::from)??;
}
self.execute_prefix_databases(

View File

@ -1,5 +1,6 @@
use grenad::CompressionType;
use rayon::ThreadPool;
use crate::thread_pool_no_abort::ThreadPoolNoAbort;
#[derive(Debug)]
pub struct IndexerConfig {
@ -9,7 +10,7 @@ pub struct IndexerConfig {
pub max_memory: Option<usize>,
pub chunk_compression_type: CompressionType,
pub chunk_compression_level: Option<u32>,
pub thread_pool: Option<ThreadPool>,
pub thread_pool: Option<ThreadPoolNoAbort>,
pub max_positions_per_attributes: Option<u32>,
pub skip_index_budget: bool,
}

View File

@ -3,6 +3,7 @@ use std::path::PathBuf;
use hf_hub::api::sync::ApiError;
use crate::error::FaultSource;
use crate::PanicCatched;
#[derive(Debug, thiserror::Error)]
#[error("Error while generating embeddings: {inner}")]
@ -80,6 +81,8 @@ pub enum EmbedErrorKind {
OpenAiUnexpectedDimension(usize, usize),
#[error("no embedding was produced")]
MissingEmbedding,
#[error(transparent)]
PanicInThreadPool(#[from] PanicCatched),
}
impl EmbedError {

View File

@ -7,6 +7,7 @@ use serde::{Deserialize, Serialize};
use self::error::{EmbedError, NewEmbedderError};
use crate::prompt::{Prompt, PromptData};
use crate::ThreadPoolNoAbort;
pub mod error;
pub mod hf;
@ -254,7 +255,7 @@ impl Embedder {
pub fn embed_chunks(
&self,
text_chunks: Vec<Vec<String>>,
threads: &rayon::ThreadPool,
threads: &ThreadPoolNoAbort,
) -> std::result::Result<Vec<Vec<Embeddings<f32>>>, EmbedError> {
match self {
Embedder::HuggingFace(embedder) => embedder.embed_chunks(text_chunks),

View File

@ -3,6 +3,8 @@ use rayon::iter::{IntoParallelIterator as _, ParallelIterator as _};
use super::error::{EmbedError, EmbedErrorKind, NewEmbedderError, NewEmbedderErrorKind};
use super::rest::{Embedder as RestEmbedder, EmbedderOptions as RestEmbedderOptions};
use super::{DistributionShift, Embeddings};
use crate::error::FaultSource;
use crate::ThreadPoolNoAbort;
#[derive(Debug)]
pub struct Embedder {
@ -71,11 +73,16 @@ impl Embedder {
pub fn embed_chunks(
&self,
text_chunks: Vec<Vec<String>>,
threads: &rayon::ThreadPool,
threads: &ThreadPoolNoAbort,
) -> Result<Vec<Vec<Embeddings<f32>>>, EmbedError> {
threads.install(move || {
text_chunks.into_par_iter().map(move |chunk| self.embed(chunk)).collect()
})
threads
.install(move || {
text_chunks.into_par_iter().map(move |chunk| self.embed(chunk)).collect()
})
.map_err(|error| EmbedError {
kind: EmbedErrorKind::PanicInThreadPool(error),
fault: FaultSource::Bug,
})?
}
pub fn chunk_count_hint(&self) -> usize {

View File

@ -4,7 +4,9 @@ use rayon::iter::{IntoParallelIterator, ParallelIterator as _};
use super::error::{EmbedError, NewEmbedderError};
use super::rest::{Embedder as RestEmbedder, EmbedderOptions as RestEmbedderOptions};
use super::{DistributionShift, Embeddings};
use crate::error::FaultSource;
use crate::vector::error::EmbedErrorKind;
use crate::ThreadPoolNoAbort;
#[derive(Debug, Clone, Hash, PartialEq, Eq, serde::Deserialize, serde::Serialize)]
pub struct EmbedderOptions {
@ -241,11 +243,16 @@ impl Embedder {
pub fn embed_chunks(
&self,
text_chunks: Vec<Vec<String>>,
threads: &rayon::ThreadPool,
threads: &ThreadPoolNoAbort,
) -> Result<Vec<Vec<Embeddings<f32>>>, EmbedError> {
threads.install(move || {
text_chunks.into_par_iter().map(move |chunk| self.embed(chunk)).collect()
})
threads
.install(move || {
text_chunks.into_par_iter().map(move |chunk| self.embed(chunk)).collect()
})
.map_err(|error| EmbedError {
kind: EmbedErrorKind::PanicInThreadPool(error),
fault: FaultSource::Bug,
})?
}
pub fn chunk_count_hint(&self) -> usize {

View File

@ -2,9 +2,12 @@ use deserr::Deserr;
use rayon::iter::{IntoParallelIterator as _, ParallelIterator as _};
use serde::{Deserialize, Serialize};
use super::error::EmbedErrorKind;
use super::{
DistributionShift, EmbedError, Embedding, Embeddings, NewEmbedderError, REQUEST_PARALLELISM,
};
use crate::error::FaultSource;
use crate::ThreadPoolNoAbort;
// retrying in case of failure
@ -158,11 +161,16 @@ impl Embedder {
pub fn embed_chunks(
&self,
text_chunks: Vec<Vec<String>>,
threads: &rayon::ThreadPool,
threads: &ThreadPoolNoAbort,
) -> Result<Vec<Vec<Embeddings<f32>>>, EmbedError> {
threads.install(move || {
text_chunks.into_par_iter().map(move |chunk| self.embed(chunk)).collect()
})
threads
.install(move || {
text_chunks.into_par_iter().map(move |chunk| self.embed(chunk)).collect()
})
.map_err(|error| EmbedError {
kind: EmbedErrorKind::PanicInThreadPool(error),
fault: FaultSource::Bug,
})?
}
pub fn chunk_count_hint(&self) -> usize {

View File

@ -301,10 +301,14 @@ impl From<EmbeddingConfig> for EmbeddingSettings {
fn from(value: EmbeddingConfig) -> Self {
let EmbeddingConfig { embedder_options, prompt } = value;
match embedder_options {
super::EmbedderOptions::HuggingFace(options) => Self {
super::EmbedderOptions::HuggingFace(super::hf::EmbedderOptions {
model,
revision,
distribution,
}) => Self {
source: Setting::Set(EmbedderSource::HuggingFace),
model: Setting::Set(options.model),
revision: options.revision.map(Setting::Set).unwrap_or_default(),
model: Setting::Set(model),
revision: revision.map(Setting::Set).unwrap_or_default(),
api_key: Setting::NotSet,
dimensions: Setting::NotSet,
document_template: Setting::Set(prompt.template),
@ -314,14 +318,19 @@ impl From<EmbeddingConfig> for EmbeddingSettings {
path_to_embeddings: Setting::NotSet,
embedding_object: Setting::NotSet,
input_type: Setting::NotSet,
distribution: options.distribution.map(Setting::Set).unwrap_or_default(),
distribution: distribution.map(Setting::Set).unwrap_or_default(),
},
super::EmbedderOptions::OpenAi(options) => Self {
super::EmbedderOptions::OpenAi(super::openai::EmbedderOptions {
api_key,
embedding_model,
dimensions,
distribution,
}) => Self {
source: Setting::Set(EmbedderSource::OpenAi),
model: Setting::Set(options.embedding_model.name().to_owned()),
model: Setting::Set(embedding_model.name().to_owned()),
revision: Setting::NotSet,
api_key: options.api_key.map(Setting::Set).unwrap_or_default(),
dimensions: options.dimensions.map(Setting::Set).unwrap_or_default(),
api_key: api_key.map(Setting::Set).unwrap_or_default(),
dimensions: dimensions.map(Setting::Set).unwrap_or_default(),
document_template: Setting::Set(prompt.template),
url: Setting::NotSet,
query: Setting::NotSet,
@ -329,29 +338,37 @@ impl From<EmbeddingConfig> for EmbeddingSettings {
path_to_embeddings: Setting::NotSet,
embedding_object: Setting::NotSet,
input_type: Setting::NotSet,
distribution: options.distribution.map(Setting::Set).unwrap_or_default(),
distribution: distribution.map(Setting::Set).unwrap_or_default(),
},
super::EmbedderOptions::Ollama(options) => Self {
super::EmbedderOptions::Ollama(super::ollama::EmbedderOptions {
embedding_model,
url,
api_key,
distribution,
}) => Self {
source: Setting::Set(EmbedderSource::Ollama),
model: Setting::Set(options.embedding_model.to_owned()),
model: Setting::Set(embedding_model),
revision: Setting::NotSet,
api_key: Setting::NotSet,
api_key: api_key.map(Setting::Set).unwrap_or_default(),
dimensions: Setting::NotSet,
document_template: Setting::Set(prompt.template),
url: Setting::NotSet,
url: url.map(Setting::Set).unwrap_or_default(),
query: Setting::NotSet,
input_field: Setting::NotSet,
path_to_embeddings: Setting::NotSet,
embedding_object: Setting::NotSet,
input_type: Setting::NotSet,
distribution: options.distribution.map(Setting::Set).unwrap_or_default(),
distribution: distribution.map(Setting::Set).unwrap_or_default(),
},
super::EmbedderOptions::UserProvided(options) => Self {
super::EmbedderOptions::UserProvided(super::manual::EmbedderOptions {
dimensions,
distribution,
}) => Self {
source: Setting::Set(EmbedderSource::UserProvided),
model: Setting::NotSet,
revision: Setting::NotSet,
api_key: Setting::NotSet,
dimensions: Setting::Set(options.dimensions),
dimensions: Setting::Set(dimensions),
document_template: Setting::NotSet,
url: Setting::NotSet,
query: Setting::NotSet,
@ -359,7 +376,7 @@ impl From<EmbeddingConfig> for EmbeddingSettings {
path_to_embeddings: Setting::NotSet,
embedding_object: Setting::NotSet,
input_type: Setting::NotSet,
distribution: options.distribution.map(Setting::Set).unwrap_or_default(),
distribution: distribution.map(Setting::Set).unwrap_or_default(),
},
super::EmbedderOptions::Rest(super::rest::EmbedderOptions {
api_key,

View File

@ -217,9 +217,7 @@ fn add_memory_samples(
memory_counters: &mut Option<MemoryCounterHandles>,
last_memory: &mut MemoryStats,
) -> Option<MemoryStats> {
let Some(stats) = memory else {
return None;
};
let stats = memory?;
let memory_counters =
memory_counters.get_or_insert_with(|| MemoryCounterHandles::new(profile, main));

View File

@ -0,0 +1,68 @@
{
"name": "movies-subset-hf-embeddings",
"run_count": 5,
"extra_cli_args": [
"--max-indexing-threads=4"
],
"assets": {
"movies-100.json": {
"local_location": null,
"remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/movies-100.json",
"sha256": "d215e395e4240f12f03b8f1f68901eac82d9e7ded5b462cbf4a6b8efde76c6c6"
}
},
"commands": [
{
"route": "experimental-features",
"method": "PATCH",
"body": {
"inline": {
"vectorStore": true
}
},
"synchronous": "DontWait"
},
{
"route": "indexes/movies/settings",
"method": "PATCH",
"body": {
"inline": {
"searchableAttributes": [
"title",
"overview"
],
"filterableAttributes": [
"genres",
"release_date"
],
"sortableAttributes": [
"release_date"
]
}
},
"synchronous": "WaitForTask"
},
{
"route": "indexes/movies/settings",
"method": "PATCH",
"body": {
"inline": {
"embedders": {
"default": {
"source": "huggingFace"
}
}
}
},
"synchronous": "WaitForTask"
},
{
"route": "indexes/movies/documents",
"method": "POST",
"body": {
"asset": "movies-100.json"
},
"synchronous": "WaitForTask"
}
]
}

View File

@ -0,0 +1,72 @@
{
"name": "settings-add-embeddings-hf",
"run_count": 5,
"extra_cli_args": [
"--max-indexing-threads=4"
],
"assets": {
"movies-100.json": {
"local_location": null,
"remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/movies-100.json",
"sha256": "d215e395e4240f12f03b8f1f68901eac82d9e7ded5b462cbf4a6b8efde76c6c6"
}
},
"commands": [
{
"route": "experimental-features",
"method": "PATCH",
"body": {
"inline": {
"vectorStore": true
}
},
"synchronous": "DontWait"
},
{
"route": "indexes/movies/settings",
"method": "PATCH",
"body": {
"inline": {
"searchableAttributes": [
"title",
"overview"
],
"filterableAttributes": [
"genres",
"release_date"
],
"sortableAttributes": [
"release_date"
]
}
},
"synchronous": "DontWait"
},
{
"route": "indexes/movies/documents",
"method": "POST",
"body": {
"asset": "movies-100.json"
},
"synchronous": "WaitForTask"
},
{
"route": "indexes/movies/settings",
"method": "PATCH",
"body": {
"inline": {
"embedders": {
"default": {
"source": "huggingFace",
"model": null,
"revision": null,
"documentTemplate": null,
"distribution": null
}
}
}
},
"synchronous": "WaitForTask"
}
]
}