From fb705116a6798035c199f48730a027c426933086 Mon Sep 17 00:00:00 2001 From: Gosti Date: Tue, 30 Jan 2024 16:32:57 +0100 Subject: [PATCH 1/8] feat: add new models and ability to override dimensions --- milli/src/vector/openai.rs | 54 ++++++++++++++++++++++++++++++++---- milli/src/vector/settings.rs | 3 ++ 2 files changed, 52 insertions(+), 5 deletions(-) diff --git a/milli/src/vector/openai.rs b/milli/src/vector/openai.rs index 524f83b80..20013d8e8 100644 --- a/milli/src/vector/openai.rs +++ b/milli/src/vector/openai.rs @@ -17,6 +17,7 @@ pub struct Embedder { pub struct EmbedderOptions { pub api_key: Option, pub embedding_model: EmbeddingModel, + pub dimensions: Option, } #[derive( @@ -41,34 +42,54 @@ pub enum EmbeddingModel { #[serde(rename = "text-embedding-ada-002")] #[deserr(rename = "text-embedding-ada-002")] TextEmbeddingAda002, + + #[serde(rename = "text-embedding-3-small")] + #[deserr(rename = "text-embedding-3-small")] + TextEmbedding3Small, + + #[serde(rename = "text-embedding-3-large")] + #[deserr(rename = "text-embedding-3-large")] + TextEmbedding3Large, } impl EmbeddingModel { pub fn supported_models() -> &'static [&'static str] { - &["text-embedding-ada-002"] + &["text-embedding-ada-002", "text-embedding-3-small", "text-embedding-3-large"] } pub fn max_token(&self) -> usize { match self { EmbeddingModel::TextEmbeddingAda002 => 8191, + EmbeddingModel::TextEmbedding3Large => 8191, + EmbeddingModel::TextEmbedding3Small => 8191, } } pub fn dimensions(&self) -> usize { match self { EmbeddingModel::TextEmbeddingAda002 => 1536, + + //Default value for the model + EmbeddingModel::TextEmbedding3Large => 1536, + + //Default value for the model + EmbeddingModel::TextEmbedding3Small => 3072, } } pub fn name(&self) -> &'static str { match self { EmbeddingModel::TextEmbeddingAda002 => "text-embedding-ada-002", + EmbeddingModel::TextEmbedding3Large => "text-embedding-3-large", + EmbeddingModel::TextEmbedding3Small => "text-embedding-3-small", } } pub fn 
from_name(name: &str) -> Option { match name { "text-embedding-ada-002" => Some(EmbeddingModel::TextEmbeddingAda002), + "text-embedding-3-large" => Some(EmbeddingModel::TextEmbedding3Large), + "text-embedding-3-small" => Some(EmbeddingModel::TextEmbedding3Small), _ => None, } } @@ -78,6 +99,20 @@ impl EmbeddingModel { EmbeddingModel::TextEmbeddingAda002 => { Some(DistributionShift { current_mean: 0.90, current_sigma: 0.08 }) } + EmbeddingModel::TextEmbedding3Large => { + Some(DistributionShift { current_mean: 0.90, current_sigma: 0.08 }) + } + EmbeddingModel::TextEmbedding3Small => { + Some(DistributionShift { current_mean: 0.90, current_sigma: 0.08 }) + } + } + } + + pub fn is_optional_dimensions_supported(&self) -> bool { + match self { + EmbeddingModel::TextEmbeddingAda002 => false, + EmbeddingModel::TextEmbedding3Large => true, + EmbeddingModel::TextEmbedding3Small => true, } } } @@ -86,11 +121,11 @@ pub const OPENAI_EMBEDDINGS_URL: &str = "https://api.openai.com/v1/embeddings"; impl EmbedderOptions { pub fn with_default_model(api_key: Option) -> Self { - Self { api_key, embedding_model: Default::default() } + Self { api_key, embedding_model: Default::default(), dimensions: None } } pub fn with_embedding_model(api_key: Option, embedding_model: EmbeddingModel) -> Self { - Self { api_key, embedding_model } + Self { api_key, embedding_model, dimensions: None } } } @@ -237,7 +272,15 @@ impl Embedder { for text in texts { log::trace!("Received prompt: {}", text.as_ref()) } - let request = OpenAiRequest { model: self.options.embedding_model.name(), input: texts }; + let request = OpenAiRequest { + model: self.options.embedding_model.name(), + input: texts, + dimension: if self.options.embedding_model.is_optional_dimensions_supported() { + self.options.dimensions.as_ref() + } else { + None + }, + }; let response = client .post(OPENAI_EMBEDDINGS_URL) .json(&request) @@ -366,7 +409,7 @@ impl Embedder { } pub fn dimensions(&self) -> usize { - 
self.options.embedding_model.dimensions() + self.options.dimensions.unwrap_or_else(|| self.options.embedding_model.dimensions()) } pub fn distribution(&self) -> Option { @@ -431,6 +474,7 @@ impl Retry { struct OpenAiRequest<'a, S: AsRef + serde::Serialize> { model: &'a str, input: &'a [S], + dimension: Option<&'a usize>, } #[derive(Debug, Serialize)] diff --git a/milli/src/vector/settings.rs b/milli/src/vector/settings.rs index 37fb80452..dac129ccd 100644 --- a/milli/src/vector/settings.rs +++ b/milli/src/vector/settings.rs @@ -208,6 +208,9 @@ impl From for EmbeddingConfig { if let Some(api_key) = api_key.set() { options.api_key = Some(api_key); } + if let Some(dimensions) = dimensions.set() { + options.dimensions = Some(dimensions); + } this.embedder_options = super::EmbedderOptions::OpenAi(options); } EmbedderSource::HuggingFace => { From 7ae401347868f9ff7046fe68c44e4eb73c887a4f Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 7 Feb 2024 10:36:30 +0100 Subject: [PATCH 2/8] Make sure the overridden dimensions are always used when embedding --- milli/src/vector/openai.rs | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/milli/src/vector/openai.rs b/milli/src/vector/openai.rs index 20013d8e8..8712c7894 100644 --- a/milli/src/vector/openai.rs +++ b/milli/src/vector/openai.rs @@ -65,14 +65,10 @@ impl EmbeddingModel { } } - pub fn dimensions(&self) -> usize { + pub fn default_dimensions(&self) -> usize { match self { EmbeddingModel::TextEmbeddingAda002 => 1536, - - //Default value for the model EmbeddingModel::TextEmbedding3Large => 1536, - - //Default value for the model EmbeddingModel::TextEmbedding3Small => 3072, } } @@ -108,7 +104,7 @@ impl EmbeddingModel { } } - pub fn is_optional_dimensions_supported(&self) -> bool { + pub fn supports_overriding_dimensions(&self) -> bool { match self { EmbeddingModel::TextEmbeddingAda002 => false, EmbeddingModel::TextEmbedding3Large => true, @@ -275,7 +271,7 @@ impl Embedder { let 
request = OpenAiRequest { model: self.options.embedding_model.name(), input: texts, - dimension: if self.options.embedding_model.is_optional_dimensions_supported() { + dimension: if self.options.embedding_model.supports_overriding_dimensions() { self.options.dimensions.as_ref() } else { None @@ -323,8 +319,7 @@ impl Embedder { } let mut tokens = encoded.as_slice(); - let mut embeddings_for_prompt = - Embeddings::new(self.options.embedding_model.dimensions()); + let mut embeddings_for_prompt = Embeddings::new(self.dimensions()); while tokens.len() > max_token_count { let window = &tokens[..max_token_count]; embeddings_for_prompt.push(self.embed_tokens(window, client).await?).unwrap(); @@ -409,7 +404,11 @@ impl Embedder { } pub fn dimensions(&self) -> usize { - self.options.dimensions.unwrap_or_else(|| self.options.embedding_model.dimensions()) + if self.options.embedding_model.supports_overriding_dimensions() { + self.options.dimensions.unwrap_or(self.options.embedding_model.default_dimensions()) + } else { + self.options.embedding_model.default_dimensions() + } } pub fn distribution(&self) -> Option { From 9ac57500964450f8003e5ac9f70ae49e186a314d Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 7 Feb 2024 10:37:59 +0100 Subject: [PATCH 3/8] Retrieve the overridden dimensions from the configuration when fetching settings --- milli/src/vector/settings.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/src/vector/settings.rs b/milli/src/vector/settings.rs index dac129ccd..6fe8eddc0 100644 --- a/milli/src/vector/settings.rs +++ b/milli/src/vector/settings.rs @@ -176,7 +176,7 @@ impl From for EmbeddingSettings { model: Setting::Set(options.embedding_model.name().to_owned()), revision: Setting::NotSet, api_key: options.api_key.map(Setting::Set).unwrap_or_default(), - dimensions: Setting::NotSet, + dimensions: options.dimensions.map(Setting::Set).unwrap_or_default(), document_template: Setting::Set(prompt.template), }, 
super::EmbedderOptions::UserProvided(options) => Self { From 517f5332d67b30f06056557daa4a8b32d43f9449 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 7 Feb 2024 10:39:19 +0100 Subject: [PATCH 4/8] Allow actually passing `dimensions` for OpenAI source -> make sure the settings change is rejected or the settings task fails when the specified model doesn't support overriding `dimensions` and the passed `dimensions` differs from the model's default dimensions. --- meilisearch-types/src/error.rs | 1 + milli/src/error.rs | 7 +++++++ milli/src/update/settings.rs | 24 +++++++++++++++++++----- milli/src/vector/settings.rs | 24 ++++++++++++++++++++---- 4 files changed, 47 insertions(+), 9 deletions(-) diff --git a/meilisearch-types/src/error.rs b/meilisearch-types/src/error.rs index 2182b1836..796eb5713 100644 --- a/meilisearch-types/src/error.rs +++ b/meilisearch-types/src/error.rs @@ -347,6 +347,7 @@ impl ErrorCode for milli::Error { UserError::InvalidFieldForSource { .. } | UserError::MissingFieldForSource { .. } | UserError::InvalidOpenAiModel { .. } + | UserError::InvalidOpenAiModelDimensions { .. } | UserError::InvalidPrompt(_) => Code::InvalidSettingsEmbedders, UserError::TooManyEmbedders(_) => Code::InvalidSettingsEmbedders, UserError::InvalidPromptForEmbeddings(..) => Code::InvalidSettingsEmbedders, diff --git a/milli/src/error.rs b/milli/src/error.rs index 5a4fbc7f5..9cb984db1 100644 --- a/milli/src/error.rs +++ b/milli/src/error.rs @@ -227,6 +227,13 @@ only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and undersco source_: crate::vector::settings::EmbedderSource, embedder_name: String, }, + #[error("`.embedders.{embedder_name}.dimensions`: Model `{model}` does not support overriding its native dimensions of {expected_dimensions}. 
Found {dimensions}")] + InvalidOpenAiModelDimensions { + embedder_name: String, + model: &'static str, + dimensions: usize, + expected_dimensions: usize, + }, } impl From for Error { diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index d770bcd74..b8289626b 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -974,6 +974,9 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { crate::vector::settings::EmbeddingSettings::apply_default_source( &mut setting, ); + crate::vector::settings::EmbeddingSettings::apply_default_openai_model( + &mut setting, + ); let setting = validate_embedding_settings(setting, &name)?; changed = true; new_configs.insert(name, setting); @@ -1132,14 +1135,25 @@ pub fn validate_embedding_settings( match inferred_source { EmbedderSource::OpenAi => { check_unset(&revision, "revision", inferred_source, name)?; - check_unset(&dimensions, "dimensions", inferred_source, name)?; if let Setting::Set(model) = &model { - crate::vector::openai::EmbeddingModel::from_name(model.as_str()).ok_or( - crate::error::UserError::InvalidOpenAiModel { + let model = crate::vector::openai::EmbeddingModel::from_name(model.as_str()) + .ok_or(crate::error::UserError::InvalidOpenAiModel { embedder_name: name.to_owned(), model: model.clone(), - }, - )?; + })?; + if let Setting::Set(dimensions) = dimensions { + if !model.supports_overriding_dimensions() + && dimensions != model.default_dimensions() + { + return Err(crate::error::UserError::InvalidOpenAiModelDimensions { + embedder_name: name.to_owned(), + model: model.name(), + dimensions, + expected_dimensions: model.default_dimensions(), + } + .into()); + } + } } } EmbedderSource::HuggingFace => { diff --git a/milli/src/vector/settings.rs b/milli/src/vector/settings.rs index 6fe8eddc0..834a1c81d 100644 --- a/milli/src/vector/settings.rs +++ b/milli/src/vector/settings.rs @@ -1,6 +1,7 @@ use deserr::Deserr; use serde::{Deserialize, Serialize}; +use super::openai; use 
crate::prompt::PromptData; use crate::update::Setting; use crate::vector::EmbeddingConfig; @@ -82,7 +83,7 @@ impl EmbeddingSettings { Self::MODEL => &[EmbedderSource::HuggingFace, EmbedderSource::OpenAi], Self::REVISION => &[EmbedderSource::HuggingFace], Self::API_KEY => &[EmbedderSource::OpenAi], - Self::DIMENSIONS => &[EmbedderSource::UserProvided], + Self::DIMENSIONS => &[EmbedderSource::OpenAi, EmbedderSource::UserProvided], Self::DOCUMENT_TEMPLATE => &[EmbedderSource::HuggingFace, EmbedderSource::OpenAi], _other => unreachable!("unknown field"), } @@ -90,9 +91,13 @@ impl EmbeddingSettings { pub fn allowed_fields_for_source(source: EmbedderSource) -> &'static [&'static str] { match source { - EmbedderSource::OpenAi => { - &[Self::SOURCE, Self::MODEL, Self::API_KEY, Self::DOCUMENT_TEMPLATE] - } + EmbedderSource::OpenAi => &[ + Self::SOURCE, + Self::MODEL, + Self::API_KEY, + Self::DOCUMENT_TEMPLATE, + Self::DIMENSIONS, + ], EmbedderSource::HuggingFace => { &[Self::SOURCE, Self::MODEL, Self::REVISION, Self::DOCUMENT_TEMPLATE] } @@ -109,6 +114,17 @@ impl EmbeddingSettings { *source = Setting::Set(EmbedderSource::default()) } } + + pub(crate) fn apply_default_openai_model(setting: &mut Setting) { + if let Setting::Set(EmbeddingSettings { + source: Setting::Set(EmbedderSource::OpenAi), + model: model @ (Setting::NotSet | Setting::Reset), + .. 
+ }) = setting + { + *model = Setting::Set(openai::EmbeddingModel::default().name().to_owned()) + } + } } #[derive(Debug, Clone, Copy, Default, Serialize, Deserialize, PartialEq, Eq, Deserr)] From 74c180267ef1a32f9ca8943b762edfb78afab669 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 7 Feb 2024 11:03:00 +0100 Subject: [PATCH 5/8] pass dimensions only when defined --- milli/src/vector/openai.rs | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/milli/src/vector/openai.rs b/milli/src/vector/openai.rs index 8712c7894..9deb2e2da 100644 --- a/milli/src/vector/openai.rs +++ b/milli/src/vector/openai.rs @@ -271,11 +271,7 @@ impl Embedder { let request = OpenAiRequest { model: self.options.embedding_model.name(), input: texts, - dimension: if self.options.embedding_model.supports_overriding_dimensions() { - self.options.dimensions.as_ref() - } else { - None - }, + dimensions: self.overriden_dimensions(), }; let response = client .post(OPENAI_EMBEDDINGS_URL) @@ -360,8 +356,11 @@ impl Embedder { tokens: &[usize], client: &reqwest::Client, ) -> Result { - let request = - OpenAiTokensRequest { model: self.options.embedding_model.name(), input: tokens }; + let request = OpenAiTokensRequest { + model: self.options.embedding_model.name(), + input: tokens, + dimensions: self.overriden_dimensions(), + }; let response = client .post(OPENAI_EMBEDDINGS_URL) .json(&request) @@ -414,6 +413,14 @@ impl Embedder { pub fn distribution(&self) -> Option { self.options.embedding_model.distribution() } + + fn overriden_dimensions(&self) -> Option { + if self.options.embedding_model.supports_overriding_dimensions() { + self.options.dimensions + } else { + None + } + } } // retrying in case of failure @@ -473,13 +480,16 @@ impl Retry { struct OpenAiRequest<'a, S: AsRef + serde::Serialize> { model: &'a str, input: &'a [S], - dimension: Option<&'a usize>, + #[serde(skip_serializing_if = "Option::is_none")] + dimensions: Option, } #[derive(Debug, 
Serialize)] struct OpenAiTokensRequest<'a> { model: &'a str, input: &'a [usize], + #[serde(skip_serializing_if = "Option::is_none")] + dimensions: Option, } #[derive(Debug, Deserialize)] From 32ee05cceffc4cb3425283be67adc8a94d50824c Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 7 Feb 2024 11:48:19 +0100 Subject: [PATCH 6/8] Fix default dimensions for models --- milli/src/vector/openai.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/milli/src/vector/openai.rs b/milli/src/vector/openai.rs index 9deb2e2da..f608f1d12 100644 --- a/milli/src/vector/openai.rs +++ b/milli/src/vector/openai.rs @@ -68,8 +68,8 @@ impl EmbeddingModel { pub fn default_dimensions(&self) -> usize { match self { EmbeddingModel::TextEmbeddingAda002 => 1536, - EmbeddingModel::TextEmbedding3Large => 1536, - EmbeddingModel::TextEmbedding3Small => 3072, + EmbeddingModel::TextEmbedding3Large => 3072, + EmbeddingModel::TextEmbedding3Small => 1536, } } From 88d03c56ab479a1ad0710a368f6735e171017f92 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 7 Feb 2024 11:48:47 +0100 Subject: [PATCH 7/8] Don't accept dimensions of 0 (ever) or dimensions greater than the default dimensions of the model --- meilisearch-types/src/error.rs | 2 ++ milli/src/error.rs | 9 +++++++++ milli/src/update/settings.rs | 17 +++++++++++++++++ 3 files changed, 28 insertions(+) diff --git a/meilisearch-types/src/error.rs b/meilisearch-types/src/error.rs index 796eb5713..1b54e77c0 100644 --- a/meilisearch-types/src/error.rs +++ b/meilisearch-types/src/error.rs @@ -348,6 +348,8 @@ impl ErrorCode for milli::Error { | UserError::MissingFieldForSource { .. } | UserError::InvalidOpenAiModel { .. } | UserError::InvalidOpenAiModelDimensions { .. } + | UserError::InvalidOpenAiModelDimensionsMax { .. } + | UserError::InvalidSettingsDimensions { .. 
} | UserError::InvalidPrompt(_) => Code::InvalidSettingsEmbedders, UserError::TooManyEmbedders(_) => Code::InvalidSettingsEmbedders, UserError::InvalidPromptForEmbeddings(..) => Code::InvalidSettingsEmbedders, diff --git a/milli/src/error.rs b/milli/src/error.rs index 9cb984db1..1147085dd 100644 --- a/milli/src/error.rs +++ b/milli/src/error.rs @@ -234,6 +234,15 @@ only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and undersco dimensions: usize, expected_dimensions: usize, }, + #[error("`.embedders.{embedder_name}.dimensions`: Model `{model}` does not support overriding its dimensions to a value higher than {max_dimensions}. Found {dimensions}")] + InvalidOpenAiModelDimensionsMax { + embedder_name: String, + model: &'static str, + dimensions: usize, + max_dimensions: usize, + }, + #[error("`.embedders.{embedder_name}.dimensions`: `dimensions` cannot be zero")] + InvalidSettingsDimensions { embedder_name: String }, } impl From for Error { diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index b8289626b..a3ba42119 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -1122,6 +1122,14 @@ pub fn validate_embedding_settings( let Setting::Set(settings) = settings else { return Ok(settings) }; let EmbeddingSettings { source, model, revision, api_key, dimensions, document_template } = settings; + + if let Some(0) = dimensions.set() { + return Err(crate::error::UserError::InvalidSettingsDimensions { + embedder_name: name.to_owned(), + } + .into()); + } + let Some(inferred_source) = source.set() else { return Ok(Setting::Set(EmbeddingSettings { source, @@ -1153,6 +1161,15 @@ pub fn validate_embedding_settings( } .into()); } + if dimensions > model.default_dimensions() { + return Err(crate::error::UserError::InvalidOpenAiModelDimensionsMax { + embedder_name: name.to_owned(), + model: model.name(), + dimensions, + max_dimensions: model.default_dimensions(), + } + .into()); + } } } } From 
a1caac9bfbb469256151cf163086b81fecc5817f Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 7 Feb 2024 14:22:13 +0100 Subject: [PATCH 8/8] Correct distribution shifts for new models --- milli/src/vector/openai.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/milli/src/vector/openai.rs b/milli/src/vector/openai.rs index f608f1d12..104decb66 100644 --- a/milli/src/vector/openai.rs +++ b/milli/src/vector/openai.rs @@ -96,10 +96,10 @@ impl EmbeddingModel { Some(DistributionShift { current_mean: 0.90, current_sigma: 0.08 }) } EmbeddingModel::TextEmbedding3Large => { - Some(DistributionShift { current_mean: 0.90, current_sigma: 0.08 }) + Some(DistributionShift { current_mean: 0.70, current_sigma: 0.1 }) } EmbeddingModel::TextEmbedding3Small => { - Some(DistributionShift { current_mean: 0.90, current_sigma: 0.08 }) + Some(DistributionShift { current_mean: 0.75, current_sigma: 0.1 }) } } }