Merge pull request #5659 from meilisearch/tmp-release-v1.15.1

Bring back v1.15.0 and v1.15.1 changes
This commit is contained in:
Clément Renault
2025-06-12 09:16:56 +00:00
committed by GitHub
201 changed files with 4015 additions and 1188 deletions

View File

@@ -18,7 +18,7 @@ bincode = "1.3.3"
bstr = "1.11.3"
bytemuck = { version = "1.21.0", features = ["extern_crate_alloc"] }
byteorder = "1.5.0"
charabia = { version = "0.9.3", default-features = false }
charabia = { version = "0.9.6", default-features = false }
concat-arrays = "0.1.2"
convert_case = "0.6.0"
crossbeam-channel = "0.5.15"

View File

@@ -1,9 +1,6 @@
use std::mem;
use heed::Database;
use heed::DatabaseStat;
use heed::RoTxn;
use heed::Unspecified;
use heed::{Database, DatabaseStat, RoTxn, Unspecified};
use serde::{Deserialize, Serialize};
use crate::BEU32;

View File

@@ -1,10 +1,9 @@
use heed::{
types::{SerdeJson, Str},
RoTxn, RwTxn,
};
use heed::types::{SerdeJson, Str};
use heed::{RoTxn, RwTxn};
use serde::{Deserialize, Serialize};
use crate::{index::main_key, Index};
use crate::index::main_key;
use crate::Index;
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, Default)]
#[serde(rename_all = "camelCase")]
@@ -33,13 +32,6 @@ impl Index {
Ok(())
}
pub(crate) fn delete_disabled_typos_terms(&self, txn: &mut RwTxn<'_>) -> heed::Result<()> {
self.main
.remap_types::<Str, SerdeJson<DisabledTyposTerms>>()
.delete(txn, main_key::DISABLED_TYPOS_TERMS)?;
Ok(())
}
}
impl DisabledTyposTerms {

View File

@@ -1,5 +1,4 @@
use std::collections::BTreeSet;
use std::collections::HashMap;
use std::collections::{BTreeSet, HashMap};
use std::convert::Infallible;
use std::fmt::Write;
use std::{io, str};
@@ -387,6 +386,8 @@ and can not be more than 511 bytes.", .document_id.to_string()
DocumentEditionRuntimeError(Box<EvalAltResult>),
#[error("Document edition runtime error encountered while compiling the function: {0}")]
DocumentEditionCompilationError(rhai::ParseError),
#[error("`.chat.documentTemplateMaxBytes`: `documentTemplateMaxBytes` cannot be zero")]
InvalidChatSettingsDocumentTemplateMaxBytes,
#[error("{0}")]
DocumentEmbeddingError(String),
}

View File

@@ -32,13 +32,13 @@ impl ExternalDocumentsIds {
&self,
rtxn: &RoTxn<'_>,
external_id: A,
) -> heed::Result<Option<u32>> {
) -> heed::Result<Option<DocumentId>> {
self.0.get(rtxn, external_id.as_ref())
}
/// An helper function to debug this type, returns an `HashMap` of both,
/// soft and hard fst maps, combined.
pub fn to_hash_map(&self, rtxn: &RoTxn<'_>) -> heed::Result<HashMap<String, u32>> {
pub fn to_hash_map(&self, rtxn: &RoTxn<'_>) -> heed::Result<HashMap<String, DocumentId>> {
let mut map = HashMap::default();
for result in self.0.iter(rtxn)? {
let (external, internal) = result?;

View File

@@ -7,6 +7,7 @@ use crate::FieldId;
mod global;
pub mod metadata;
pub use global::GlobalFieldsIdsMap;
pub use metadata::{FieldIdMapWithMetadata, MetadataBuilder};
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FieldsIdsMap {

View File

@@ -1,13 +1,12 @@
use std::collections::{BTreeSet, HashSet};
use deserr::{DeserializeError, Deserr, ValuePointerRef};
use serde::{Deserialize, Serialize};
use std::collections::{BTreeSet, HashSet};
use utoipa::ToSchema;
use crate::{
attribute_patterns::{match_distinct_field, match_field_legacy, PatternMatch},
constants::RESERVED_GEO_FIELD_NAME,
AttributePatterns,
};
use crate::attribute_patterns::{match_distinct_field, match_field_legacy, PatternMatch};
use crate::constants::RESERVED_GEO_FIELD_NAME;
use crate::AttributePatterns;
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug, ToSchema)]
#[serde(untagged)]

View File

@@ -1,14 +1,18 @@
use std::borrow::Cow;
use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet};
use std::error::Error;
use std::fmt;
use std::fs::File;
use std::path::Path;
use deserr::Deserr;
use heed::types::*;
use heed::{CompactionOption, Database, DatabaseStat, RoTxn, RwTxn, Unspecified, WithoutTls};
use indexmap::IndexMap;
use roaring::RoaringBitmap;
use rstar::RTree;
use serde::{Deserialize, Serialize};
use utoipa::ToSchema;
use crate::constants::{self, RESERVED_GEO_FIELD_NAME, RESERVED_VECTORS_FIELD_NAME};
use crate::database_stats::DatabaseStats;
@@ -23,7 +27,9 @@ use crate::heed_codec::facet::{
use crate::heed_codec::version::VersionCodec;
use crate::heed_codec::{BEU16StrCodec, FstSetCodec, StrBEU16Codec, StrRefCodec};
use crate::order_by_map::OrderByMap;
use crate::prompt::PromptData;
use crate::proximity::ProximityPrecision;
use crate::update::new::StdResult;
use crate::vector::{ArroyStats, ArroyWrapper, Embedding, EmbeddingConfig};
use crate::{
default_criteria, CboRoaringBitmapCodec, Criterion, DocumentId, ExternalDocumentsIds,
@@ -79,6 +85,7 @@ pub mod main_key {
pub const PREFIX_SEARCH: &str = "prefix_search";
pub const DOCUMENTS_STATS: &str = "documents_stats";
pub const DISABLED_TYPOS_TERMS: &str = "disabled_typos_terms";
pub const CHAT: &str = "chat";
}
pub mod db_name {
@@ -1691,6 +1698,25 @@ impl Index {
self.main.remap_key_type::<Str>().delete(txn, main_key::FACET_SEARCH)
}
pub fn chat_config(&self, txn: &RoTxn<'_>) -> heed::Result<ChatConfig> {
self.main
.remap_types::<Str, SerdeJson<_>>()
.get(txn, main_key::CHAT)
.map(|o| o.unwrap_or_default())
}
pub(crate) fn put_chat_config(
&self,
txn: &mut RwTxn<'_>,
val: &ChatConfig,
) -> heed::Result<()> {
self.main.remap_types::<Str, SerdeJson<_>>().put(txn, main_key::CHAT, &val)
}
pub(crate) fn delete_chat_config(&self, txn: &mut RwTxn<'_>) -> heed::Result<bool> {
self.main.remap_key_type::<Str>().delete(txn, main_key::CHAT)
}
pub fn localized_attributes_rules(
&self,
rtxn: &RoTxn<'_>,
@@ -1917,13 +1943,99 @@ pub struct IndexEmbeddingConfig {
pub user_provided: RoaringBitmap,
}
#[derive(Debug, Default, Deserialize, Serialize)]
pub struct ChatConfig {
pub description: String,
/// Contains the document template and max template length.
pub prompt: PromptData,
pub search_parameters: SearchParameters,
}
#[derive(Debug, Default, Deserialize, Serialize)]
#[serde(rename_all = "camelCase")]
pub struct SearchParameters {
#[serde(skip_serializing_if = "Option::is_none")]
pub hybrid: Option<HybridQuery>,
#[serde(skip_serializing_if = "Option::is_none")]
pub limit: Option<usize>,
#[serde(skip_serializing_if = "Option::is_none")]
pub sort: Option<Vec<String>>,
#[serde(skip_serializing_if = "Option::is_none")]
pub distinct: Option<String>,
#[serde(skip_serializing_if = "Option::is_none")]
pub matching_strategy: Option<MatchingStrategy>,
#[serde(skip_serializing_if = "Option::is_none")]
pub attributes_to_search_on: Option<Vec<String>>,
#[serde(skip_serializing_if = "Option::is_none")]
pub ranking_score_threshold: Option<RankingScoreThreshold>,
}
#[derive(Debug, Clone, Copy, Default, Deserialize, Serialize, PartialEq, Deserr, ToSchema)]
#[deserr(try_from(f64) = TryFrom::try_from -> InvalidSettingsRankingScoreThreshold)]
pub struct RankingScoreThreshold(f64);
impl RankingScoreThreshold {
pub fn as_f64(&self) -> f64 {
self.0
}
}
impl TryFrom<f64> for RankingScoreThreshold {
type Error = InvalidSettingsRankingScoreThreshold;
fn try_from(value: f64) -> StdResult<Self, Self::Error> {
if !(0.0..=1.0).contains(&value) {
Err(InvalidSettingsRankingScoreThreshold)
} else {
Ok(RankingScoreThreshold(value))
}
}
}
#[derive(Debug)]
pub struct InvalidSettingsRankingScoreThreshold;
impl Error for InvalidSettingsRankingScoreThreshold {}
impl fmt::Display for InvalidSettingsRankingScoreThreshold {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(
f,
"the value of `rankingScoreThreshold` is invalid, expected a float between `0.0` and `1.0`."
)
}
}
#[derive(Debug, Clone, Default, Deserialize, Serialize)]
#[serde(rename_all = "camelCase")]
pub struct HybridQuery {
pub semantic_ratio: f32,
pub embedder: String,
}
#[derive(Debug, Deserialize, Serialize)]
#[serde(rename_all = "camelCase")]
pub struct PrefixSettings {
pub prefix_count_threshold: usize,
pub max_prefix_length: usize,
pub compute_prefixes: PrefixSearch,
}
/// This is unfortunately a duplication of the struct in <meilisearch/src/search/mod.rs>.
/// The reason why it is duplicated is because milli cannot depend on meilisearch. It would be cyclic imports.
#[derive(Default, Debug, Copy, Clone, PartialEq, Eq, Deserr, ToSchema, Serialize, Deserialize)]
#[deserr(rename_all = camelCase)]
#[serde(rename_all = "camelCase")]
pub enum MatchingStrategy {
/// Remove query words from last to first
#[default]
Last,
/// All query words are mandatory
All,
/// Remove query words from the most frequent to the least
Frequency,
}
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, Default)]
#[serde(rename_all = "camelCase")]
pub enum PrefixSearch {

View File

@@ -52,18 +52,19 @@ pub use search::new::{
};
use serde_json::Value;
pub use thread_pool_no_abort::{PanicCatched, ThreadPoolNoAbort, ThreadPoolNoAbortBuilder};
pub use {charabia as tokenizer, heed, rhai};
pub use {arroy, charabia as tokenizer, heed, rhai};
pub use self::asc_desc::{AscDesc, AscDescError, Member, SortError};
pub use self::attribute_patterns::AttributePatterns;
pub use self::attribute_patterns::PatternMatch;
pub use self::attribute_patterns::{AttributePatterns, PatternMatch};
pub use self::criterion::{default_criteria, Criterion, CriterionError};
pub use self::error::{
Error, FieldIdMapMissingEntry, InternalError, SerializationError, UserError,
};
pub use self::external_documents_ids::ExternalDocumentsIds;
pub use self::fieldids_weights_map::FieldidsWeightsMap;
pub use self::fields_ids_map::{FieldsIdsMap, GlobalFieldsIdsMap};
pub use self::fields_ids_map::{
FieldIdMapWithMetadata, FieldsIdsMap, GlobalFieldsIdsMap, MetadataBuilder,
};
pub use self::filterable_attributes_rules::{
FilterFeatures, FilterableAttributesFeatures, FilterableAttributesPatterns,
FilterableAttributesRule,
@@ -84,8 +85,6 @@ pub use self::search::{
};
pub use self::update::ChannelCongestion;
pub use arroy;
pub type Result<T> = std::result::Result<T, error::Error>;
pub type Attribute = u32;

View File

@@ -1,4 +1,3 @@
use enum_iterator::Sequence;
use std::any::TypeId;
use std::borrow::Cow;
use std::marker::PhantomData;
@@ -6,6 +5,7 @@ use std::sync::atomic::{AtomicU32, Ordering};
use std::sync::{Arc, RwLock};
use std::time::{Duration, Instant};
use enum_iterator::Sequence;
use indexmap::IndexMap;
use itertools::Itertools;
use serde::Serialize;

View File

@@ -18,6 +18,7 @@ impl NewPromptError {
Self { kind: NewPromptErrorKind::CannotParseTemplate(inner), fault: FaultSource::User }
}
#[allow(unused)] // See <https://github.com/meilisearch/meilisearch/pull/5593> for explanation
pub(crate) fn invalid_fields_in_template(inner: liquid::Error) -> NewPromptError {
Self { kind: NewPromptErrorKind::InvalidFieldsInTemplate(inner), fault: FaultSource::User }
}
@@ -27,6 +28,7 @@ impl NewPromptError {
pub enum NewPromptErrorKind {
#[error("cannot parse template: {0}")]
CannotParseTemplate(liquid::Error),
#[allow(unused)] // See <https://github.com/meilisearch/meilisearch/pull/5593> for explanation
#[error("template contains invalid fields: {0}. Only `doc.*`, `fields[i].name`, `fields[i].value` are supported")]
InvalidFieldsInTemplate(liquid::Error),
}

View File

@@ -2,7 +2,6 @@ mod context;
mod document;
pub(crate) mod error;
mod fields;
mod template_checker;
use std::cell::RefCell;
use std::convert::TryFrom;
@@ -65,7 +64,7 @@ fn default_template() -> liquid::Template {
new_template(default_template_text()).unwrap()
}
fn default_template_text() -> &'static str {
pub fn default_template_text() -> &'static str {
"{% for field in fields %}\
{% if field.is_searchable and field.value != nil %}\
{{ field.name }}: {{ field.value }}\n\
@@ -105,11 +104,6 @@ impl Prompt {
max_bytes,
};
// render template with special object that's OK with `doc.*` and `fields.*`
this.template
.render(&template_checker::TemplateChecker)
.map_err(NewPromptError::invalid_fields_in_template)?;
Ok(this)
}
@@ -206,6 +200,7 @@ mod test {
}
#[test]
#[ignore] // See <https://github.com/meilisearch/meilisearch/pull/5593> for explanation
fn template_missing_doc() {
assert!(matches!(
Prompt::new("{{title}}: {{overview}}".into(), None),
@@ -236,6 +231,7 @@ mod test {
}
#[test]
#[ignore] // See <https://github.com/meilisearch/meilisearch/pull/5593> for explanation
fn template_fields_invalid() {
assert!(matches!(
// intentionally garbled field

View File

@@ -1,301 +0,0 @@
use liquid::model::{
ArrayView, DisplayCow, KStringCow, ObjectRender, ObjectSource, State, Value as LiquidValue,
};
use liquid::{Object, ObjectView, ValueView};
#[derive(Debug)]
pub struct TemplateChecker;
#[derive(Debug)]
pub struct DummyDoc;
#[derive(Debug)]
pub struct DummyFields;
#[derive(Debug)]
pub struct DummyField;
const DUMMY_VALUE: &LiquidValue = &LiquidValue::Nil;
impl ObjectView for DummyField {
fn as_value(&self) -> &dyn ValueView {
self
}
fn size(&self) -> i64 {
2
}
fn keys<'k>(&'k self) -> Box<dyn Iterator<Item = KStringCow<'k>> + 'k> {
Box::new(["name", "value"].iter().map(|s| KStringCow::from_static(s)))
}
fn values<'k>(&'k self) -> Box<dyn Iterator<Item = &'k dyn ValueView> + 'k> {
Box::new(vec![DUMMY_VALUE.as_view(), DUMMY_VALUE.as_view()].into_iter())
}
fn iter<'k>(&'k self) -> Box<dyn Iterator<Item = (KStringCow<'k>, &'k dyn ValueView)> + 'k> {
Box::new(self.keys().zip(self.values()))
}
fn contains_key(&self, index: &str) -> bool {
index == "name" || index == "value"
}
fn get<'s>(&'s self, index: &str) -> Option<&'s dyn ValueView> {
if self.contains_key(index) {
Some(DUMMY_VALUE.as_view())
} else {
None
}
}
}
impl ValueView for DummyField {
fn as_debug(&self) -> &dyn std::fmt::Debug {
self
}
fn render(&self) -> DisplayCow<'_> {
DUMMY_VALUE.render()
}
fn source(&self) -> DisplayCow<'_> {
DUMMY_VALUE.source()
}
fn type_name(&self) -> &'static str {
"object"
}
fn query_state(&self, state: State) -> bool {
match state {
State::Truthy => true,
State::DefaultValue => false,
State::Empty => false,
State::Blank => false,
}
}
fn to_kstr(&self) -> KStringCow<'_> {
DUMMY_VALUE.to_kstr()
}
fn to_value(&self) -> LiquidValue {
let mut this = Object::new();
this.insert("name".into(), LiquidValue::Nil);
this.insert("value".into(), LiquidValue::Nil);
LiquidValue::Object(this)
}
fn as_object(&self) -> Option<&dyn ObjectView> {
Some(self)
}
}
impl ValueView for DummyFields {
fn as_debug(&self) -> &dyn std::fmt::Debug {
self
}
fn render(&self) -> DisplayCow<'_> {
DUMMY_VALUE.render()
}
fn source(&self) -> DisplayCow<'_> {
DUMMY_VALUE.source()
}
fn type_name(&self) -> &'static str {
"array"
}
fn query_state(&self, state: State) -> bool {
match state {
State::Truthy => true,
State::DefaultValue => false,
State::Empty => false,
State::Blank => false,
}
}
fn to_kstr(&self) -> KStringCow<'_> {
DUMMY_VALUE.to_kstr()
}
fn to_value(&self) -> LiquidValue {
LiquidValue::Array(vec![DummyField.to_value()])
}
fn as_array(&self) -> Option<&dyn ArrayView> {
Some(self)
}
}
impl ArrayView for DummyFields {
fn as_value(&self) -> &dyn ValueView {
self
}
fn size(&self) -> i64 {
u16::MAX as i64
}
fn values<'k>(&'k self) -> Box<dyn Iterator<Item = &'k dyn ValueView> + 'k> {
Box::new(std::iter::once(DummyField.as_value()))
}
fn contains_key(&self, index: i64) -> bool {
index < self.size()
}
fn get(&self, _index: i64) -> Option<&dyn ValueView> {
Some(DummyField.as_value())
}
}
impl ObjectView for DummyDoc {
fn as_value(&self) -> &dyn ValueView {
self
}
fn size(&self) -> i64 {
1000
}
fn keys<'k>(&'k self) -> Box<dyn Iterator<Item = KStringCow<'k>> + 'k> {
Box::new(std::iter::empty())
}
fn values<'k>(&'k self) -> Box<dyn Iterator<Item = &'k dyn ValueView> + 'k> {
Box::new(std::iter::empty())
}
fn iter<'k>(&'k self) -> Box<dyn Iterator<Item = (KStringCow<'k>, &'k dyn ValueView)> + 'k> {
Box::new(std::iter::empty())
}
fn contains_key(&self, _index: &str) -> bool {
true
}
fn get<'s>(&'s self, _index: &str) -> Option<&'s dyn ValueView> {
// Recursively sends itself
Some(self)
}
}
impl ValueView for DummyDoc {
fn as_debug(&self) -> &dyn std::fmt::Debug {
self
}
fn render(&self) -> DisplayCow<'_> {
DUMMY_VALUE.render()
}
fn source(&self) -> DisplayCow<'_> {
DUMMY_VALUE.source()
}
fn type_name(&self) -> &'static str {
"object"
}
fn query_state(&self, state: State) -> bool {
match state {
State::Truthy => true,
State::DefaultValue => false,
State::Empty => false,
State::Blank => false,
}
}
fn to_kstr(&self) -> KStringCow<'_> {
DUMMY_VALUE.to_kstr()
}
fn to_value(&self) -> LiquidValue {
LiquidValue::Nil
}
fn as_object(&self) -> Option<&dyn ObjectView> {
Some(self)
}
}
impl ObjectView for TemplateChecker {
fn as_value(&self) -> &dyn ValueView {
self
}
fn size(&self) -> i64 {
2
}
fn keys<'k>(&'k self) -> Box<dyn Iterator<Item = KStringCow<'k>> + 'k> {
Box::new(["doc", "fields"].iter().map(|s| KStringCow::from_static(s)))
}
fn values<'k>(&'k self) -> Box<dyn Iterator<Item = &'k dyn ValueView> + 'k> {
Box::new(
std::iter::once(DummyDoc.as_value()).chain(std::iter::once(DummyFields.as_value())),
)
}
fn iter<'k>(&'k self) -> Box<dyn Iterator<Item = (KStringCow<'k>, &'k dyn ValueView)> + 'k> {
Box::new(self.keys().zip(self.values()))
}
fn contains_key(&self, index: &str) -> bool {
index == "doc" || index == "fields"
}
fn get<'s>(&'s self, index: &str) -> Option<&'s dyn ValueView> {
match index {
"doc" => Some(DummyDoc.as_value()),
"fields" => Some(DummyFields.as_value()),
_ => None,
}
}
}
impl ValueView for TemplateChecker {
fn as_debug(&self) -> &dyn std::fmt::Debug {
self
}
fn render(&self) -> liquid::model::DisplayCow<'_> {
DisplayCow::Owned(Box::new(ObjectRender::new(self)))
}
fn source(&self) -> liquid::model::DisplayCow<'_> {
DisplayCow::Owned(Box::new(ObjectSource::new(self)))
}
fn type_name(&self) -> &'static str {
"object"
}
fn query_state(&self, state: liquid::model::State) -> bool {
match state {
State::Truthy => true,
State::DefaultValue | State::Empty | State::Blank => false,
}
}
fn to_kstr(&self) -> liquid::model::KStringCow<'_> {
let s = ObjectRender::new(self).to_string();
KStringCow::from_string(s)
}
fn to_value(&self) -> LiquidValue {
LiquidValue::Object(
self.iter().map(|(k, x)| (k.to_string().into(), x.to_value())).collect(),
)
}
fn as_object(&self) -> Option<&dyn ObjectView> {
Some(self)
}
}

View File

@@ -10,6 +10,7 @@ pub use self::facet::{FacetDistribution, Filter, OrderBy, DEFAULT_VALUES_PER_FAC
pub use self::new::matches::{FormatOptions, MatchBounds, MatcherBuilder, MatchingWords};
use self::new::{execute_vector_search, PartialSearchResult, VectorStoreStats};
use crate::filterable_attributes_rules::{filtered_matching_patterns, matching_features};
use crate::index::MatchingStrategy;
use crate::score_details::{ScoreDetails, ScoringStrategy};
use crate::vector::Embedder;
use crate::{
@@ -364,6 +365,16 @@ impl Default for TermsMatchingStrategy {
}
}
impl From<MatchingStrategy> for TermsMatchingStrategy {
fn from(other: MatchingStrategy) -> Self {
match other {
MatchingStrategy::Last => Self::Last,
MatchingStrategy::All => Self::All,
MatchingStrategy::Frequency => Self::Frequency,
}
}
}
fn get_first(s: &str) -> &str {
match s.chars().next() {
Some(c) => &s[..c.len_utf8()],

View File

@@ -1,8 +1,9 @@
use std::collections::VecDeque;
use heed::types::{Bytes, Unit};
use heed::{RoPrefix, RoTxn};
use roaring::RoaringBitmap;
use rstar::RTree;
use std::collections::VecDeque;
use super::facet_string_values;
use super::ranking_rules::{RankingRule, RankingRuleOutput, RankingRuleQueryTrait};

View File

@@ -1,7 +1,6 @@
use std::borrow::Cow;
use std::cmp::Ordering;
use std::collections::BTreeSet;
use std::ops::ControlFlow;
use fst::automaton::Str;
use fst::{IntoStreamer, Streamer};
@@ -16,12 +15,6 @@ use crate::search::new::{limits, SearchContext};
use crate::search::{build_dfa, get_first};
use crate::{Result, MAX_WORD_LENGTH};
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum NumberOfTypos {
One,
Two,
}
impl Interned<QueryTerm> {
pub fn compute_fully_if_needed(self, ctx: &mut SearchContext<'_>) -> Result<()> {
let s = ctx.term_interner.get_mut(self);
@@ -45,7 +38,7 @@ impl Interned<QueryTerm> {
fn find_zero_typo_prefix_derivations(
ctx: &mut SearchContext<'_>,
word_interned: Interned<String>,
mut visit: impl FnMut(Interned<String>) -> Result<ControlFlow<()>>,
prefix_of: &mut BTreeSet<Interned<String>>,
) -> Result<()> {
let word = ctx.word_interner.get(word_interned).to_owned();
let word = word.as_str();
@@ -65,8 +58,8 @@ fn find_zero_typo_prefix_derivations(
let derived_word = derived_word.to_string();
let derived_word_interned = ctx.word_interner.insert(derived_word);
if derived_word_interned != word_interned {
let cf = visit(derived_word_interned)?;
if cf.is_break() {
prefix_of.insert(derived_word_interned);
if prefix_of.len() >= limits::MAX_PREFIX_COUNT {
break;
}
}
@@ -81,7 +74,7 @@ fn find_one_typo_derivations(
ctx: &mut SearchContext<'_>,
word_interned: Interned<String>,
is_prefix: bool,
mut visit: impl FnMut(Interned<String>) -> Result<ControlFlow<()>>,
one_typo_words: &mut BTreeSet<Interned<String>>,
) -> Result<()> {
let fst = ctx.get_words_fst()?;
let word = ctx.word_interner.get(word_interned).to_owned();
@@ -98,8 +91,8 @@ fn find_one_typo_derivations(
1 => {
let derived_word = std::str::from_utf8(derived_word)?;
let derived_word = ctx.word_interner.insert(derived_word.to_owned());
let cf = visit(derived_word)?;
if cf.is_break() {
one_typo_words.insert(derived_word);
if one_typo_words.len() >= limits::MAX_ONE_TYPO_COUNT {
break;
}
}
@@ -116,7 +109,8 @@ fn find_one_two_typo_derivations(
is_prefix: bool,
fst: fst::Set<Cow<'_, [u8]>>,
word_interner: &mut DedupInterner<String>,
mut visit: impl FnMut(Interned<String>, NumberOfTypos) -> Result<ControlFlow<()>>,
one_typo_words: &mut BTreeSet<Interned<String>>,
two_typo_words: &mut BTreeSet<Interned<String>>,
) -> Result<()> {
let word = word_interner.get(word_interned).to_owned();
let word = word.as_str();
@@ -130,15 +124,20 @@ fn find_one_two_typo_derivations(
let mut stream = fst.search_with_state(automaton).into_stream();
while let Some((derived_word, state)) = stream.next() {
let finished_one_typo_words = one_typo_words.len() >= limits::MAX_ONE_TYPO_COUNT;
let finished_two_typo_words = two_typo_words.len() >= limits::MAX_TWO_TYPOS_COUNT;
if finished_one_typo_words && finished_two_typo_words {
// No chance we will add either one- or two-typo derivations anymore, stop iterating.
break;
}
let derived_word = std::str::from_utf8(derived_word)?;
let derived_word_interned = word_interner.insert(derived_word.to_owned());
// No need to intern here
// in the case the typo is on the first letter, we know the number of typo
// is two
if get_first(derived_word) != get_first(word) {
let cf = visit(derived_word_interned, NumberOfTypos::Two)?;
if cf.is_break() {
break;
}
if get_first(derived_word) != get_first(word) && !finished_two_typo_words {
let derived_word_interned = word_interner.insert(derived_word.to_owned());
two_typo_words.insert(derived_word_interned);
continue;
} else {
// Else, we know that it is the second dfa that matched and compute the
// correct distance
@@ -146,16 +145,18 @@ fn find_one_two_typo_derivations(
match d.to_u8() {
0 => (),
1 => {
let cf = visit(derived_word_interned, NumberOfTypos::One)?;
if cf.is_break() {
break;
if finished_one_typo_words {
continue;
}
let derived_word_interned = word_interner.insert(derived_word.to_owned());
one_typo_words.insert(derived_word_interned);
}
2 => {
let cf = visit(derived_word_interned, NumberOfTypos::Two)?;
if cf.is_break() {
break;
if finished_two_typo_words {
continue;
}
let derived_word_interned = word_interner.insert(derived_word.to_owned());
two_typo_words.insert(derived_word_interned);
}
_ => unreachable!("2 typos DFA produced a distance greater than 2"),
}
@@ -211,14 +212,7 @@ pub fn partially_initialized_term_from_word(
}
if is_prefix && use_prefix_db.is_none() {
find_zero_typo_prefix_derivations(ctx, word_interned, |derived_word| {
if prefix_of.len() < limits::MAX_PREFIX_COUNT {
prefix_of.insert(derived_word);
Ok(ControlFlow::Continue(()))
} else {
Ok(ControlFlow::Break(()))
}
})?;
find_zero_typo_prefix_derivations(ctx, word_interned, &mut prefix_of)?;
}
let synonyms = ctx.index.synonyms(ctx.txn)?;
let mut synonym_word_count = 0;
@@ -281,14 +275,7 @@ impl Interned<QueryTerm> {
let mut one_typo_words = BTreeSet::new();
if *max_nbr_typos > 0 {
find_one_typo_derivations(ctx, original, is_prefix, |derived_word| {
if one_typo_words.len() < limits::MAX_ONE_TYPO_COUNT {
one_typo_words.insert(derived_word);
Ok(ControlFlow::Continue(()))
} else {
Ok(ControlFlow::Break(()))
}
})?;
find_one_typo_derivations(ctx, original, is_prefix, &mut one_typo_words)?;
}
let split_words = if allows_split_words {
@@ -343,27 +330,8 @@ impl Interned<QueryTerm> {
*is_prefix,
ctx.index.words_fst(ctx.txn)?,
&mut ctx.word_interner,
|derived_word, nbr_typos| {
if one_typo_words.len() >= limits::MAX_ONE_TYPO_COUNT
&& two_typo_words.len() >= limits::MAX_TWO_TYPOS_COUNT
{
// No chance we will add either one- or two-typo derivations anymore, stop iterating.
return Ok(ControlFlow::Break(()));
}
match nbr_typos {
NumberOfTypos::One => {
if one_typo_words.len() < limits::MAX_ONE_TYPO_COUNT {
one_typo_words.insert(derived_word);
}
}
NumberOfTypos::Two => {
if two_typo_words.len() < limits::MAX_TWO_TYPOS_COUNT {
two_typo_words.insert(derived_word);
}
}
}
Ok(ControlFlow::Continue(()))
},
&mut one_typo_words,
&mut two_typo_words,
)?;
}

View File

@@ -0,0 +1,182 @@
use std::error::Error;
use std::fmt;
use deserr::errors::JsonError;
use deserr::Deserr;
use serde::{Deserialize, Serialize};
use utoipa::ToSchema;
use crate::index::{self, ChatConfig, MatchingStrategy, RankingScoreThreshold, SearchParameters};
use crate::prompt::{default_max_bytes, PromptData};
use crate::update::Setting;
#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Deserr, ToSchema)]
#[serde(deny_unknown_fields, rename_all = "camelCase")]
#[deserr(error = JsonError, deny_unknown_fields, rename_all = camelCase)]
pub struct ChatSettings {
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
#[deserr(default)]
#[schema(value_type = Option<String>)]
pub description: Setting<String>,
/// A liquid template used to render documents to a text that can be embedded.
///
/// Meillisearch interpolates the template for each document and sends the resulting text to the embedder.
/// The embedder then generates document vectors based on this text.
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
#[deserr(default)]
#[schema(value_type = Option<String>)]
pub document_template: Setting<String>,
/// Rendered texts are truncated to this size. Defaults to 400.
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
#[deserr(default)]
#[schema(value_type = Option<usize>)]
pub document_template_max_bytes: Setting<usize>,
/// The search parameters to use for the LLM.
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
#[deserr(default)]
#[schema(value_type = Option<ChatSearchParams>)]
pub search_parameters: Setting<ChatSearchParams>,
}
impl From<ChatConfig> for ChatSettings {
fn from(config: ChatConfig) -> Self {
let ChatConfig {
description,
prompt: PromptData { template, max_bytes },
search_parameters,
} = config;
ChatSettings {
description: Setting::Set(description),
document_template: Setting::Set(template),
document_template_max_bytes: Setting::Set(
max_bytes.unwrap_or(default_max_bytes()).get(),
),
search_parameters: Setting::Set({
let SearchParameters {
hybrid,
limit,
sort,
distinct,
matching_strategy,
attributes_to_search_on,
ranking_score_threshold,
} = search_parameters;
let hybrid = hybrid.map(|index::HybridQuery { semantic_ratio, embedder }| {
HybridQuery { semantic_ratio: SemanticRatio(semantic_ratio), embedder }
});
ChatSearchParams {
hybrid: Setting::some_or_not_set(hybrid),
limit: Setting::some_or_not_set(limit),
sort: Setting::some_or_not_set(sort),
distinct: Setting::some_or_not_set(distinct),
matching_strategy: Setting::some_or_not_set(matching_strategy),
attributes_to_search_on: Setting::some_or_not_set(attributes_to_search_on),
ranking_score_threshold: Setting::some_or_not_set(ranking_score_threshold),
}
}),
}
}
}
#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Deserr, ToSchema)]
#[serde(deny_unknown_fields, rename_all = "camelCase")]
#[deserr(error = JsonError, deny_unknown_fields, rename_all = camelCase)]
pub struct ChatSearchParams {
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
#[deserr(default)]
#[schema(value_type = Option<HybridQuery>)]
pub hybrid: Setting<HybridQuery>,
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
#[deserr(default = Setting::Set(20))]
#[schema(value_type = Option<usize>)]
pub limit: Setting<usize>,
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
#[deserr(default)]
#[schema(value_type = Option<Vec<String>>)]
pub sort: Setting<Vec<String>>,
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
#[deserr(default)]
#[schema(value_type = Option<String>)]
pub distinct: Setting<String>,
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
#[deserr(default)]
#[schema(value_type = Option<MatchingStrategy>)]
pub matching_strategy: Setting<MatchingStrategy>,
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
#[deserr(default)]
#[schema(value_type = Option<Vec<String>>)]
pub attributes_to_search_on: Setting<Vec<String>>,
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
#[deserr(default)]
#[schema(value_type = Option<RankingScoreThreshold>)]
pub ranking_score_threshold: Setting<RankingScoreThreshold>,
}
#[derive(Debug, Clone, Default, Deserr, ToSchema, PartialEq, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
#[deserr(error = JsonError, rename_all = camelCase, deny_unknown_fields)]
pub struct HybridQuery {
#[deserr(default)]
#[serde(default)]
#[schema(default, value_type = f32)]
pub semantic_ratio: SemanticRatio,
#[schema(value_type = String)]
pub embedder: String,
}
#[derive(Debug, Clone, Copy, Deserr, ToSchema, PartialEq, Serialize, Deserialize)]
#[deserr(try_from(f32) = TryFrom::try_from -> InvalidSearchSemanticRatio)]
pub struct SemanticRatio(f32);
impl Default for SemanticRatio {
fn default() -> Self {
SemanticRatio(0.5)
}
}
impl std::convert::TryFrom<f32> for SemanticRatio {
type Error = InvalidSearchSemanticRatio;
fn try_from(f: f32) -> Result<Self, Self::Error> {
// the suggested "fix" is: `!(0.0..=1.0).contains(&f)`` which is allegedly less readable
#[allow(clippy::manual_range_contains)]
if f > 1.0 || f < 0.0 {
Err(InvalidSearchSemanticRatio)
} else {
Ok(SemanticRatio(f))
}
}
}
#[derive(Debug)]
pub struct InvalidSearchSemanticRatio;
impl Error for InvalidSearchSemanticRatio {}
impl fmt::Display for InvalidSearchSemanticRatio {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(
f,
"the value of `semanticRatio` is invalid, expected a float between `0.0` and `1.0`."
)
}
}
impl std::ops::Deref for SemanticRatio {
type Target = f32;
fn deref(&self) -> &Self::Target {
&self.0
}
}

View File

@@ -1562,12 +1562,12 @@ mod tests {
let rtxn = index.read_txn().unwrap();
// Only the first document should match.
let count = index.word_docids.get(&rtxn, "huàzhuāngbāo").unwrap().unwrap().len();
let count = index.word_docids.get(&rtxn, "huàzhuāng").unwrap().unwrap().len();
assert_eq!(count, 1);
// Only the second document should match.
let count = index.word_docids.get(&rtxn, "bāo").unwrap().unwrap().len();
assert_eq!(count, 1);
assert_eq!(count, 2);
let mut search = crate::Search::new(&rtxn, &index);
search.query("化妆包");

View File

@@ -1,4 +1,5 @@
pub use self::available_ids::AvailableIds;
pub use self::chat::ChatSettings;
pub use self::clear_documents::ClearDocuments;
pub use self::concurrent_available_ids::ConcurrentAvailableIds;
pub use self::facet::bulk::FacetsUpdateBulk;
@@ -13,6 +14,7 @@ pub use self::words_prefix_integer_docids::WordPrefixIntegerDocids;
pub use self::words_prefixes_fst::WordsPrefixesFst;
mod available_ids;
mod chat;
mod clear_documents;
mod concurrent_available_ids;
pub(crate) mod del_add;

View File

@@ -4,6 +4,7 @@ use serde_json::Value;
use crate::attribute_patterns::PatternMatch;
use crate::fields_ids_map::metadata::Metadata;
use crate::filterable_attributes_rules::match_faceted_field;
use crate::update::new::document::Document;
use crate::update::new::extract::geo::extract_geo_coordinates;
use crate::update::new::extract::perm_json_p;
@@ -11,8 +12,6 @@ use crate::{
FieldId, FilterableAttributesRule, GlobalFieldsIdsMap, InternalError, Result, UserError,
};
use crate::filterable_attributes_rules::match_faceted_field;
#[allow(clippy::too_many_arguments)]
pub fn extract_document_facets<'doc>(
document: impl Document<'doc>,

View File

@@ -18,7 +18,8 @@ pub use vectors::EmbeddingExtractor;
pub mod perm_json_p {
use serde_json::{Map, Value};
use crate::{attribute_patterns::PatternMatch, Result};
use crate::attribute_patterns::PatternMatch;
use crate::Result;
const SPLIT_SYMBOL: char = '.';
/// Returns `true` if the `selector` match the `key`.

View File

@@ -131,7 +131,12 @@ fn compute_word_fst(
}
}
pub fn recompute_word_fst_from_word_docids_database(index: &Index, wtxn: &mut RwTxn) -> Result<()> {
pub fn recompute_word_fst_from_word_docids_database(
index: &Index,
wtxn: &mut RwTxn,
progress: &Progress,
) -> Result<()> {
progress.update_progress(PostProcessingWords::WordFst);
let fst = fst::Set::default().map_data(std::borrow::Cow::Owned)?;
let mut word_fst_builder = WordFstBuilder::new(&fst)?;
let words = index.word_docids.iter(wtxn)?.remap_data_type::<DecodeIgnore>();

View File

@@ -11,22 +11,23 @@ use roaring::RoaringBitmap;
use serde::{Deserialize, Deserializer, Serialize, Serializer};
use time::OffsetDateTime;
use super::chat::ChatSearchParams;
use super::del_add::{DelAdd, DelAddOperation};
use super::index_documents::{IndexDocumentsConfig, Transform};
use super::IndexerConfig;
use super::{ChatSettings, IndexerConfig};
use crate::attribute_patterns::PatternMatch;
use crate::constants::RESERVED_GEO_FIELD_NAME;
use crate::criterion::Criterion;
use crate::disabled_typos_terms::DisabledTyposTerms;
use crate::error::UserError;
use crate::error::UserError::{self, InvalidChatSettingsDocumentTemplateMaxBytes};
use crate::fields_ids_map::metadata::{FieldIdMapWithMetadata, MetadataBuilder};
use crate::filterable_attributes_rules::match_faceted_field;
use crate::index::{
IndexEmbeddingConfig, PrefixSearch, DEFAULT_MIN_WORD_LEN_ONE_TYPO,
DEFAULT_MIN_WORD_LEN_TWO_TYPOS,
ChatConfig, IndexEmbeddingConfig, PrefixSearch, SearchParameters,
DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS,
};
use crate::order_by_map::OrderByMap;
use crate::prompt::default_max_bytes;
use crate::prompt::{default_max_bytes, default_template_text, PromptData};
use crate::proximity::ProximityPrecision;
use crate::update::index_documents::IndexDocumentsMethod;
use crate::update::{IndexDocuments, UpdateIndexingStep};
@@ -185,6 +186,7 @@ pub struct Settings<'a, 't, 'i> {
localized_attributes_rules: Setting<Vec<LocalizedAttributesRule>>,
prefix_search: Setting<PrefixSearch>,
facet_search: Setting<bool>,
chat: Setting<ChatSettings>,
}
impl<'a, 't, 'i> Settings<'a, 't, 'i> {
@@ -223,6 +225,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
localized_attributes_rules: Setting::NotSet,
prefix_search: Setting::NotSet,
facet_search: Setting::NotSet,
chat: Setting::NotSet,
indexer_config,
}
}
@@ -453,6 +456,14 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
self.facet_search = Setting::Reset;
}
pub fn set_chat(&mut self, value: ChatSettings) {
self.chat = Setting::Set(value);
}
pub fn reset_chat(&mut self) {
self.chat = Setting::Reset;
}
#[tracing::instrument(
level = "trace"
skip(self, progress_callback, should_abort, settings_diff),
@@ -884,7 +895,6 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
disabled_typos_terms.disable_on_numbers = disable_on_numbers;
}
Setting::Reset => {
self.index.delete_disabled_typos_terms(self.wtxn)?;
disabled_typos_terms.disable_on_numbers =
DisabledTyposTerms::default().disable_on_numbers;
}
@@ -1239,6 +1249,112 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
Ok(())
}
fn update_chat_config(&mut self) -> Result<bool> {
match &mut self.chat {
Setting::Set(ChatSettings {
description: new_description,
document_template: new_document_template,
document_template_max_bytes: new_document_template_max_bytes,
search_parameters: new_search_parameters,
}) => {
let ChatConfig { description, prompt, search_parameters } =
self.index.chat_config(self.wtxn)?;
let description = match new_description {
Setting::Set(new) => new.clone(),
Setting::Reset => Default::default(),
Setting::NotSet => description,
};
let prompt = PromptData {
template: match new_document_template {
Setting::Set(new) => new.clone(),
Setting::Reset => default_template_text().to_string(),
Setting::NotSet => prompt.template.clone(),
},
max_bytes: match new_document_template_max_bytes {
Setting::Set(m) => Some(
NonZeroUsize::new(*m)
.ok_or(InvalidChatSettingsDocumentTemplateMaxBytes)?,
),
Setting::Reset => Some(default_max_bytes()),
Setting::NotSet => prompt.max_bytes,
},
};
let search_parameters = match new_search_parameters {
Setting::Set(sp) => {
let ChatSearchParams {
hybrid,
limit,
sort,
distinct,
matching_strategy,
attributes_to_search_on,
ranking_score_threshold,
} = sp;
SearchParameters {
hybrid: match hybrid {
Setting::Set(hybrid) => Some(crate::index::HybridQuery {
semantic_ratio: *hybrid.semantic_ratio,
embedder: hybrid.embedder.clone(),
}),
Setting::Reset => None,
Setting::NotSet => search_parameters.hybrid.clone(),
},
limit: match limit {
Setting::Set(limit) => Some(*limit),
Setting::Reset => None,
Setting::NotSet => search_parameters.limit,
},
sort: match sort {
Setting::Set(sort) => Some(sort.clone()),
Setting::Reset => None,
Setting::NotSet => search_parameters.sort.clone(),
},
distinct: match distinct {
Setting::Set(distinct) => Some(distinct.clone()),
Setting::Reset => None,
Setting::NotSet => search_parameters.distinct.clone(),
},
matching_strategy: match matching_strategy {
Setting::Set(matching_strategy) => Some(*matching_strategy),
Setting::Reset => None,
Setting::NotSet => search_parameters.matching_strategy,
},
attributes_to_search_on: match attributes_to_search_on {
Setting::Set(attributes_to_search_on) => {
Some(attributes_to_search_on.clone())
}
Setting::Reset => None,
Setting::NotSet => {
search_parameters.attributes_to_search_on.clone()
}
},
ranking_score_threshold: match ranking_score_threshold {
Setting::Set(rst) => Some(*rst),
Setting::Reset => None,
Setting::NotSet => search_parameters.ranking_score_threshold,
},
}
}
Setting::Reset => Default::default(),
Setting::NotSet => search_parameters,
};
self.index.put_chat_config(
self.wtxn,
&ChatConfig { description, prompt, search_parameters },
)?;
Ok(true)
}
Setting::Reset => self.index.delete_chat_config(self.wtxn).map_err(Into::into),
Setting::NotSet => Ok(false),
}
}
pub fn execute<FP, FA>(mut self, progress_callback: FP, should_abort: FA) -> Result<()>
where
FP: Fn(UpdateIndexingStep) + Sync,
@@ -1276,6 +1392,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
self.update_facet_search()?;
self.update_localized_attributes_rules()?;
self.update_disabled_typos_terms()?;
self.update_chat_config()?;
let embedding_config_updates = self.update_embedding_configs()?;

View File

@@ -897,6 +897,7 @@ fn test_correct_settings_init() {
prefix_search,
facet_search,
disable_on_numbers,
chat,
} = settings;
assert!(matches!(searchable_fields, Setting::NotSet));
assert!(matches!(displayed_fields, Setting::NotSet));
@@ -925,6 +926,7 @@ fn test_correct_settings_init() {
assert!(matches!(prefix_search, Setting::NotSet));
assert!(matches!(facet_search, Setting::NotSet));
assert!(matches!(disable_on_numbers, Setting::NotSet));
assert!(matches!(chat, Setting::NotSet));
})
.unwrap();
}

View File

@@ -3,7 +3,7 @@ use heed::RwTxn;
use super::UpgradeIndex;
use crate::progress::Progress;
use crate::update::new::indexer::recompute_word_fst_from_word_docids_database;
use crate::{make_enum_progress, Index, Result};
use crate::{Index, Result};
#[allow(non_camel_case_types)]
pub(super) struct Latest_V1_14_To_Latest_V1_15();
@@ -17,14 +17,7 @@ impl UpgradeIndex for Latest_V1_14_To_Latest_V1_15 {
progress: Progress,
) -> Result<bool> {
// Recompute the word FST from the word docids database.
make_enum_progress! {
enum TypoTolerance {
RecomputeWordFst,
}
};
progress.update_progress(TypoTolerance::RecomputeWordFst);
recompute_word_fst_from_word_docids_database(index, wtxn)?;
recompute_word_fst_from_word_docids_database(index, wtxn, &progress)?;
Ok(false)
}

View File

@@ -33,6 +33,7 @@ pub struct EmbeddingSettings {
///
/// - Defaults to `openAi`
pub source: Setting<EmbedderSource>,
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
#[deserr(default)]
#[schema(value_type = Option<String>)]
@@ -55,6 +56,7 @@ pub struct EmbeddingSettings {
/// - For source `openAi`, defaults to `text-embedding-3-small`
/// - For source `huggingFace`, defaults to `BAAI/bge-base-en-v1.5`
pub model: Setting<String>,
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
#[deserr(default)]
#[schema(value_type = Option<String>)]
@@ -75,6 +77,7 @@ pub struct EmbeddingSettings {
/// - When `model` is set to default, defaults to `617ca489d9e86b49b8167676d8220688b99db36e`
/// - Otherwise, defaults to `null`
pub revision: Setting<String>,
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
#[deserr(default)]
#[schema(value_type = Option<OverridePooling>)]
@@ -96,6 +99,7 @@ pub struct EmbeddingSettings {
///
/// - Embedders created before this parameter was available default to `forceMean` to preserve the existing behavior.
pub pooling: Setting<OverridePooling>,
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
#[deserr(default)]
#[schema(value_type = Option<String>)]
@@ -118,6 +122,7 @@ pub struct EmbeddingSettings {
///
/// - This setting is partially hidden when returned by the settings
pub api_key: Setting<String>,
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
#[deserr(default)]
#[schema(value_type = Option<String>)]
@@ -141,6 +146,7 @@ pub struct EmbeddingSettings {
/// - For source `openAi`, the dimensions is the maximum allowed by the model.
/// - For sources `ollama` and `rest`, the dimensions are inferred by embedding a sample text.
pub dimensions: Setting<usize>,
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
#[deserr(default)]
#[schema(value_type = Option<bool>)]
@@ -167,6 +173,7 @@ pub struct EmbeddingSettings {
/// first enabling it. If you are unsure of whether the performance-relevancy tradeoff is right for you,
/// we recommend to use this parameter on a test index first.
pub binary_quantized: Setting<bool>,
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
#[deserr(default)]
#[schema(value_type = Option<bool>)]
@@ -183,6 +190,7 @@ pub struct EmbeddingSettings {
///
/// - 🏗️ When modified, embeddings are regenerated for documents whose rendering through the template produces a different text.
pub document_template: Setting<String>,
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
#[deserr(default)]
#[schema(value_type = Option<usize>)]
@@ -201,6 +209,7 @@ pub struct EmbeddingSettings {
///
/// - Defaults to 400
pub document_template_max_bytes: Setting<usize>,
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
#[deserr(default)]
#[schema(value_type = Option<String>)]
@@ -219,6 +228,7 @@ pub struct EmbeddingSettings {
/// - 🌱 When modified for source `openAi`, embeddings are never regenerated
/// - 🏗️ When modified for sources `ollama` and `rest`, embeddings are always regenerated
pub url: Setting<String>,
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
#[deserr(default)]
#[schema(value_type = Option<serde_json::Value>)]
@@ -236,6 +246,7 @@ pub struct EmbeddingSettings {
///
/// - 🏗️ Changing the value of this parameter always regenerates embeddings
pub request: Setting<serde_json::Value>,
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
#[deserr(default)]
#[schema(value_type = Option<serde_json::Value>)]
@@ -253,6 +264,7 @@ pub struct EmbeddingSettings {
///
/// - 🏗️ Changing the value of this parameter always regenerates embeddings
pub response: Setting<serde_json::Value>,
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
#[deserr(default)]
#[schema(value_type = Option<BTreeMap<String, String>>)]