mirror of
https://github.com/meilisearch/meilisearch.git
synced 2025-07-18 12:20:48 +00:00
Compare commits
10 Commits
reduce-pre
...
prototype-
Author | SHA1 | Date | |
---|---|---|---|
ed826a8c8b | |||
d2ef1cb425 | |||
bb389276aa | |||
e36a8c50b9 | |||
08ff135ad6 | |||
f729864466 | |||
94ea263bef | |||
0e475cb5e6 | |||
62de70b73c | |||
7707fb18dd |
@ -373,6 +373,7 @@ impl<T> From<v5::Settings<T>> for v6::Settings<v6::Unchecked> {
|
||||
},
|
||||
disable_on_words: typo.disable_on_words.into(),
|
||||
disable_on_attributes: typo.disable_on_attributes.into(),
|
||||
disable_on_numbers: v6::Setting::NotSet,
|
||||
}),
|
||||
v5::Setting::Reset => v6::Setting::Reset,
|
||||
v5::Setting::NotSet => v6::Setting::NotSet,
|
||||
|
@ -454,7 +454,10 @@ impl ErrorCode for milli::Error {
|
||||
}
|
||||
UserError::CriterionError(_) => Code::InvalidSettingsRankingRules,
|
||||
UserError::InvalidGeoField { .. } => Code::InvalidDocumentGeoField,
|
||||
UserError::InvalidVectorDimensions { .. } => Code::InvalidVectorDimensions,
|
||||
UserError::InvalidVectorDimensions { .. }
|
||||
| UserError::InvalidIndexingVectorDimensions { .. } => {
|
||||
Code::InvalidVectorDimensions
|
||||
}
|
||||
UserError::InvalidVectorsMapType { .. }
|
||||
| UserError::InvalidVectorsEmbedderConf { .. } => Code::InvalidVectorsType,
|
||||
UserError::TooManyVectors(_, _) => Code::TooManyVectors,
|
||||
|
@ -8,6 +8,7 @@ use std::str::FromStr;
|
||||
|
||||
use deserr::{DeserializeError, Deserr, ErrorKind, MergeWithError, ValuePointerRef};
|
||||
use fst::IntoStreamer;
|
||||
use milli::disabled_typos_terms::DisabledTyposTerms;
|
||||
use milli::index::{IndexEmbeddingConfig, PrefixSearch};
|
||||
use milli::proximity::ProximityPrecision;
|
||||
use milli::update::Setting;
|
||||
@ -104,6 +105,10 @@ pub struct TypoSettings {
|
||||
#[deserr(default)]
|
||||
#[schema(value_type = Option<BTreeSet<String>>, example = json!(["uuid", "url"]))]
|
||||
pub disable_on_attributes: Setting<BTreeSet<String>>,
|
||||
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
|
||||
#[deserr(default)]
|
||||
#[schema(value_type = Option<bool>, example = json!(true))]
|
||||
pub disable_on_numbers: Setting<bool>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq, Deserr, ToSchema)]
|
||||
@ -701,6 +706,12 @@ pub fn apply_settings_to_builder(
|
||||
Setting::Reset => builder.reset_exact_attributes(),
|
||||
Setting::NotSet => (),
|
||||
}
|
||||
|
||||
match value.disable_on_numbers {
|
||||
Setting::Set(val) => builder.set_disable_on_numbers(val),
|
||||
Setting::Reset => builder.reset_disable_on_numbers(),
|
||||
Setting::NotSet => (),
|
||||
}
|
||||
}
|
||||
Setting::Reset => {
|
||||
// all typo settings need to be reset here.
|
||||
@ -826,12 +837,14 @@ pub fn settings(
|
||||
};
|
||||
|
||||
let disabled_attributes = index.exact_attributes(rtxn)?.into_iter().map(String::from).collect();
|
||||
let DisabledTyposTerms { disable_on_numbers } = index.disabled_typos_terms(rtxn)?;
|
||||
|
||||
let typo_tolerance = TypoSettings {
|
||||
enabled: Setting::Set(index.authorize_typos(rtxn)?),
|
||||
min_word_size_for_typos: Setting::Set(min_typo_word_len),
|
||||
disable_on_words: Setting::Set(disabled_words),
|
||||
disable_on_attributes: Setting::Set(disabled_attributes),
|
||||
disable_on_numbers: Setting::Set(disable_on_numbers),
|
||||
};
|
||||
|
||||
let faceting = FacetingSettings {
|
||||
|
@ -87,7 +87,8 @@ async fn import_dump_v1_movie_raw() {
|
||||
"twoTypos": 9
|
||||
},
|
||||
"disableOnWords": [],
|
||||
"disableOnAttributes": []
|
||||
"disableOnAttributes": [],
|
||||
"disableOnNumbers": false
|
||||
},
|
||||
"faceting": {
|
||||
"maxValuesPerFacet": 100,
|
||||
@ -260,7 +261,8 @@ async fn import_dump_v1_movie_with_settings() {
|
||||
"twoTypos": 9
|
||||
},
|
||||
"disableOnWords": [],
|
||||
"disableOnAttributes": []
|
||||
"disableOnAttributes": [],
|
||||
"disableOnNumbers": false
|
||||
},
|
||||
"faceting": {
|
||||
"maxValuesPerFacet": 100,
|
||||
@ -432,7 +434,8 @@ async fn import_dump_v1_rubygems_with_settings() {
|
||||
"twoTypos": 9
|
||||
},
|
||||
"disableOnWords": [],
|
||||
"disableOnAttributes": []
|
||||
"disableOnAttributes": [],
|
||||
"disableOnNumbers": false
|
||||
},
|
||||
"faceting": {
|
||||
"maxValuesPerFacet": 100,
|
||||
@ -590,7 +593,8 @@ async fn import_dump_v2_movie_raw() {
|
||||
"twoTypos": 9
|
||||
},
|
||||
"disableOnWords": [],
|
||||
"disableOnAttributes": []
|
||||
"disableOnAttributes": [],
|
||||
"disableOnNumbers": false
|
||||
},
|
||||
"faceting": {
|
||||
"maxValuesPerFacet": 100,
|
||||
@ -760,7 +764,8 @@ async fn import_dump_v2_movie_with_settings() {
|
||||
"twoTypos": 9
|
||||
},
|
||||
"disableOnWords": [],
|
||||
"disableOnAttributes": []
|
||||
"disableOnAttributes": [],
|
||||
"disableOnNumbers": false
|
||||
},
|
||||
"faceting": {
|
||||
"maxValuesPerFacet": 100,
|
||||
@ -929,7 +934,8 @@ async fn import_dump_v2_rubygems_with_settings() {
|
||||
"twoTypos": 9
|
||||
},
|
||||
"disableOnWords": [],
|
||||
"disableOnAttributes": []
|
||||
"disableOnAttributes": [],
|
||||
"disableOnNumbers": false
|
||||
},
|
||||
"faceting": {
|
||||
"maxValuesPerFacet": 100,
|
||||
@ -1087,7 +1093,8 @@ async fn import_dump_v3_movie_raw() {
|
||||
"twoTypos": 9
|
||||
},
|
||||
"disableOnWords": [],
|
||||
"disableOnAttributes": []
|
||||
"disableOnAttributes": [],
|
||||
"disableOnNumbers": false
|
||||
},
|
||||
"faceting": {
|
||||
"maxValuesPerFacet": 100,
|
||||
@ -1257,7 +1264,8 @@ async fn import_dump_v3_movie_with_settings() {
|
||||
"twoTypos": 9
|
||||
},
|
||||
"disableOnWords": [],
|
||||
"disableOnAttributes": []
|
||||
"disableOnAttributes": [],
|
||||
"disableOnNumbers": false
|
||||
},
|
||||
"faceting": {
|
||||
"maxValuesPerFacet": 100,
|
||||
@ -1426,7 +1434,8 @@ async fn import_dump_v3_rubygems_with_settings() {
|
||||
"twoTypos": 9
|
||||
},
|
||||
"disableOnWords": [],
|
||||
"disableOnAttributes": []
|
||||
"disableOnAttributes": [],
|
||||
"disableOnNumbers": false
|
||||
},
|
||||
"faceting": {
|
||||
"maxValuesPerFacet": 100,
|
||||
@ -1584,7 +1593,8 @@ async fn import_dump_v4_movie_raw() {
|
||||
"twoTypos": 9
|
||||
},
|
||||
"disableOnWords": [],
|
||||
"disableOnAttributes": []
|
||||
"disableOnAttributes": [],
|
||||
"disableOnNumbers": false
|
||||
},
|
||||
"faceting": {
|
||||
"maxValuesPerFacet": 100,
|
||||
@ -1754,7 +1764,8 @@ async fn import_dump_v4_movie_with_settings() {
|
||||
"twoTypos": 9
|
||||
},
|
||||
"disableOnWords": [],
|
||||
"disableOnAttributes": []
|
||||
"disableOnAttributes": [],
|
||||
"disableOnNumbers": false
|
||||
},
|
||||
"faceting": {
|
||||
"maxValuesPerFacet": 100,
|
||||
@ -1923,7 +1934,8 @@ async fn import_dump_v4_rubygems_with_settings() {
|
||||
"twoTypos": 9
|
||||
},
|
||||
"disableOnWords": [],
|
||||
"disableOnAttributes": []
|
||||
"disableOnAttributes": [],
|
||||
"disableOnNumbers": false
|
||||
},
|
||||
"faceting": {
|
||||
"maxValuesPerFacet": 100,
|
||||
@ -2212,7 +2224,8 @@ async fn import_dump_v6_containing_experimental_features() {
|
||||
"twoTypos": 9
|
||||
},
|
||||
"disableOnWords": [],
|
||||
"disableOnAttributes": []
|
||||
"disableOnAttributes": [],
|
||||
"disableOnNumbers": false
|
||||
},
|
||||
"faceting": {
|
||||
"maxValuesPerFacet": 100,
|
||||
@ -2444,7 +2457,8 @@ async fn generate_and_import_dump_containing_vectors() {
|
||||
"twoTypos": 9
|
||||
},
|
||||
"disableOnWords": [],
|
||||
"disableOnAttributes": []
|
||||
"disableOnAttributes": [],
|
||||
"disableOnNumbers": false
|
||||
},
|
||||
"faceting": {
|
||||
"maxValuesPerFacet": 100,
|
||||
|
@ -1976,3 +1976,93 @@ async fn change_facet_casing() {
|
||||
})
|
||||
.await;
|
||||
}
|
||||
|
||||
#[actix_rt::test]
|
||||
async fn test_exact_typos_terms() {
|
||||
let documents = json!([
|
||||
{
|
||||
"id": 0,
|
||||
"title": "The zeroth document 1298484",
|
||||
},
|
||||
{
|
||||
"id": 1,
|
||||
"title": "The first document 234342",
|
||||
"nested": {
|
||||
"object": "field 22231",
|
||||
"machin": "bidule 23443.32111",
|
||||
},
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"title": "The second document 3398499",
|
||||
"nested": [
|
||||
"array",
|
||||
{
|
||||
"object": "field 23245121,23223",
|
||||
},
|
||||
{
|
||||
"prout": "truc 123980612321",
|
||||
"machin": "lol 12345645333447879",
|
||||
},
|
||||
],
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
"title": "The third document 12333",
|
||||
"nested": "I lied 98878",
|
||||
},
|
||||
]);
|
||||
|
||||
// Test prefix search
|
||||
test_settings_documents_indexing_swapping_and_search(
|
||||
&documents,
|
||||
&json!({
|
||||
"searchableAttributes": ["title", "nested.object", "nested.machin"],
|
||||
"typoTolerance": {
|
||||
"enabled": true,
|
||||
"disableOnNumbers": true
|
||||
}
|
||||
}),
|
||||
&json!({"q": "12345"}),
|
||||
|response, code| {
|
||||
assert_eq!(code, 200, "{}", response);
|
||||
snapshot!(json_string!(response["hits"]), @r###"
|
||||
[
|
||||
{
|
||||
"id": 2,
|
||||
"title": "The second document 3398499",
|
||||
"nested": [
|
||||
"array",
|
||||
{
|
||||
"object": "field 23245121,23223"
|
||||
},
|
||||
{
|
||||
"prout": "truc 123980612321",
|
||||
"machin": "lol 12345645333447879"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
"###);
|
||||
},
|
||||
)
|
||||
.await;
|
||||
|
||||
// Test typo search
|
||||
test_settings_documents_indexing_swapping_and_search(
|
||||
&documents,
|
||||
&json!({
|
||||
"searchableAttributes": ["title", "nested.object", "nested.machin"],
|
||||
"typoTolerance": {
|
||||
"enabled": true,
|
||||
"disableOnNumbers": true
|
||||
}
|
||||
}),
|
||||
&json!({"q": "123457"}),
|
||||
|response, code| {
|
||||
assert_eq!(code, 200, "{}", response);
|
||||
snapshot!(json_string!(response["hits"]), @r###"[]"###);
|
||||
},
|
||||
)
|
||||
.await;
|
||||
}
|
||||
|
@ -274,7 +274,7 @@ async fn settings_bad_typo_tolerance() {
|
||||
snapshot!(code, @"400 Bad Request");
|
||||
snapshot!(json_string!(response), @r###"
|
||||
{
|
||||
"message": "Unknown field `typoTolerance`: expected one of `enabled`, `minWordSizeForTypos`, `disableOnWords`, `disableOnAttributes`",
|
||||
"message": "Unknown field `typoTolerance`: expected one of `enabled`, `minWordSizeForTypos`, `disableOnWords`, `disableOnAttributes`, `disableOnNumbers`",
|
||||
"code": "invalid_settings_typo_tolerance",
|
||||
"type": "invalid_request",
|
||||
"link": "https://docs.meilisearch.com/errors#invalid_settings_typo_tolerance"
|
||||
|
@ -276,7 +276,7 @@ async fn secrets_are_hidden_in_settings() {
|
||||
|
||||
let (response, code) = index.settings().await;
|
||||
meili_snap::snapshot!(code, @"200 OK");
|
||||
meili_snap::snapshot!(meili_snap::json_string!(response), @r#"
|
||||
meili_snap::snapshot!(meili_snap::json_string!(response), @r###"
|
||||
{
|
||||
"displayedAttributes": [
|
||||
"*"
|
||||
@ -308,7 +308,8 @@ async fn secrets_are_hidden_in_settings() {
|
||||
"twoTypos": 9
|
||||
},
|
||||
"disableOnWords": [],
|
||||
"disableOnAttributes": []
|
||||
"disableOnAttributes": [],
|
||||
"disableOnNumbers": false
|
||||
},
|
||||
"faceting": {
|
||||
"maxValuesPerFacet": 100,
|
||||
@ -337,7 +338,7 @@ async fn secrets_are_hidden_in_settings() {
|
||||
"facetSearch": true,
|
||||
"prefixSearch": "indexingTime"
|
||||
}
|
||||
"#);
|
||||
"###);
|
||||
|
||||
let (response, code) = server.get_task(settings_update_uid).await;
|
||||
meili_snap::snapshot!(code, @"200 OK");
|
||||
|
@ -1,6 +1,5 @@
|
||||
---
|
||||
source: crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs
|
||||
snapshot_kind: text
|
||||
---
|
||||
{
|
||||
"displayedAttributes": [
|
||||
@ -49,7 +48,8 @@ snapshot_kind: text
|
||||
],
|
||||
"disableOnAttributes": [
|
||||
"surname"
|
||||
]
|
||||
],
|
||||
"disableOnNumbers": false
|
||||
},
|
||||
"faceting": {
|
||||
"maxValuesPerFacet": 99,
|
||||
|
@ -164,6 +164,87 @@ async fn add_remove_user_provided() {
|
||||
"###);
|
||||
}
|
||||
|
||||
#[actix_rt::test]
|
||||
async fn user_provide_mismatched_embedding_dimension() {
|
||||
let server = Server::new().await;
|
||||
let index = server.index("doggo");
|
||||
|
||||
let (response, code) = index
|
||||
.update_settings(json!({
|
||||
"embedders": {
|
||||
"manual": {
|
||||
"source": "userProvided",
|
||||
"dimensions": 3,
|
||||
}
|
||||
},
|
||||
}))
|
||||
.await;
|
||||
snapshot!(code, @"202 Accepted");
|
||||
server.wait_task(response.uid()).await.succeeded();
|
||||
|
||||
let documents = json!([
|
||||
{"id": 0, "name": "kefir", "_vectors": { "manual": [0, 0] }},
|
||||
]);
|
||||
let (value, code) = index.add_documents(documents, None).await;
|
||||
snapshot!(code, @"202 Accepted");
|
||||
let task = index.wait_task(value.uid()).await;
|
||||
snapshot!(task, @r###"
|
||||
{
|
||||
"uid": "[uid]",
|
||||
"batchUid": "[batch_uid]",
|
||||
"indexUid": "doggo",
|
||||
"status": "failed",
|
||||
"type": "documentAdditionOrUpdate",
|
||||
"canceledBy": null,
|
||||
"details": {
|
||||
"receivedDocuments": 1,
|
||||
"indexedDocuments": 0
|
||||
},
|
||||
"error": {
|
||||
"message": "Index `doggo`: Invalid vector dimensions in document with id `0` in `._vectors.manual`.\n - note: embedding #0 has dimensions 2\n - note: embedder `manual` requires 3",
|
||||
"code": "invalid_vector_dimensions",
|
||||
"type": "invalid_request",
|
||||
"link": "https://docs.meilisearch.com/errors#invalid_vector_dimensions"
|
||||
},
|
||||
"duration": "[duration]",
|
||||
"enqueuedAt": "[date]",
|
||||
"startedAt": "[date]",
|
||||
"finishedAt": "[date]"
|
||||
}
|
||||
"###);
|
||||
|
||||
let new_document = json!([
|
||||
{"id": 0, "name": "kefir", "_vectors": { "manual": [[0, 0], [1, 1], [2, 2]] }},
|
||||
]);
|
||||
let (response, code) = index.add_documents(new_document, None).await;
|
||||
snapshot!(code, @"202 Accepted");
|
||||
let task = index.wait_task(response.uid()).await;
|
||||
snapshot!(task, @r###"
|
||||
{
|
||||
"uid": "[uid]",
|
||||
"batchUid": "[batch_uid]",
|
||||
"indexUid": "doggo",
|
||||
"status": "failed",
|
||||
"type": "documentAdditionOrUpdate",
|
||||
"canceledBy": null,
|
||||
"details": {
|
||||
"receivedDocuments": 1,
|
||||
"indexedDocuments": 0
|
||||
},
|
||||
"error": {
|
||||
"message": "Index `doggo`: Invalid vector dimensions in document with id `0` in `._vectors.manual`.\n - note: embedding #0 has dimensions 2\n - note: embedder `manual` requires 3",
|
||||
"code": "invalid_vector_dimensions",
|
||||
"type": "invalid_request",
|
||||
"link": "https://docs.meilisearch.com/errors#invalid_vector_dimensions"
|
||||
},
|
||||
"duration": "[duration]",
|
||||
"enqueuedAt": "[date]",
|
||||
"startedAt": "[date]",
|
||||
"finishedAt": "[date]"
|
||||
}
|
||||
"###);
|
||||
}
|
||||
|
||||
async fn generate_default_user_provided_documents(server: &Server) -> Index {
|
||||
let index = server.index("doggo");
|
||||
|
||||
|
50
crates/milli/src/disabled_typos_terms.rs
Normal file
50
crates/milli/src/disabled_typos_terms.rs
Normal file
@ -0,0 +1,50 @@
|
||||
use heed::{
|
||||
types::{SerdeJson, Str},
|
||||
RoTxn, RwTxn,
|
||||
};
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use crate::{index::main_key, Index};
|
||||
|
||||
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, Default)]
|
||||
#[serde(rename_all = "camelCase")]
|
||||
pub struct DisabledTyposTerms {
|
||||
pub disable_on_numbers: bool,
|
||||
}
|
||||
|
||||
impl Index {
|
||||
pub fn disabled_typos_terms(&self, txn: &RoTxn<'_>) -> heed::Result<DisabledTyposTerms> {
|
||||
self.main
|
||||
.remap_types::<Str, SerdeJson<DisabledTyposTerms>>()
|
||||
.get(txn, main_key::DISABLED_TYPOS_TERMS)
|
||||
.map(|option| option.unwrap_or_default())
|
||||
}
|
||||
|
||||
pub(crate) fn put_disabled_typos_terms(
|
||||
&self,
|
||||
txn: &mut RwTxn<'_>,
|
||||
disabled_typos_terms: &DisabledTyposTerms,
|
||||
) -> heed::Result<()> {
|
||||
self.main.remap_types::<Str, SerdeJson<DisabledTyposTerms>>().put(
|
||||
txn,
|
||||
main_key::DISABLED_TYPOS_TERMS,
|
||||
&disabled_typos_terms,
|
||||
)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) fn delete_disabled_typos_terms(&self, txn: &mut RwTxn<'_>) -> heed::Result<()> {
|
||||
self.main
|
||||
.remap_types::<Str, SerdeJson<DisabledTyposTerms>>()
|
||||
.delete(txn, main_key::DISABLED_TYPOS_TERMS)?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl DisabledTyposTerms {
|
||||
pub fn is_exact(&self, word: &str) -> bool {
|
||||
// If disable_on_numbers is true, we disable the word if it contains only numbers or punctuation
|
||||
self.disable_on_numbers && word.chars().all(|c| c.is_numeric() || c.is_ascii_punctuation())
|
||||
}
|
||||
}
|
@ -129,6 +129,14 @@ and can not be more than 511 bytes.", .document_id.to_string()
|
||||
InvalidGeoField(#[from] GeoError),
|
||||
#[error("Invalid vector dimensions: expected: `{}`, found: `{}`.", .expected, .found)]
|
||||
InvalidVectorDimensions { expected: usize, found: usize },
|
||||
#[error("Invalid vector dimensions in document with id `{document_id}` in `._vectors.{embedder_name}`.\n - note: embedding #{embedding_index} has dimensions {found}\n - note: embedder `{embedder_name}` requires {expected}")]
|
||||
InvalidIndexingVectorDimensions {
|
||||
embedder_name: String,
|
||||
document_id: String,
|
||||
embedding_index: usize,
|
||||
expected: usize,
|
||||
found: usize,
|
||||
},
|
||||
#[error("The `_vectors` field in the document with id: `{document_id}` is not an object. Was expecting an object with a key for each embedder with manually provided vectors, but instead got `{value}`")]
|
||||
InvalidVectorsMapType { document_id: String, value: Value },
|
||||
#[error("Bad embedder configuration in the document with id: `{document_id}`. {error}")]
|
||||
|
@ -78,6 +78,7 @@ pub mod main_key {
|
||||
pub const FACET_SEARCH: &str = "facet_search";
|
||||
pub const PREFIX_SEARCH: &str = "prefix_search";
|
||||
pub const DOCUMENTS_STATS: &str = "documents_stats";
|
||||
pub const DISABLED_TYPOS_TERMS: &str = "disabled_typos_terms";
|
||||
}
|
||||
|
||||
pub mod db_name {
|
||||
|
@ -12,6 +12,7 @@ mod asc_desc;
|
||||
mod attribute_patterns;
|
||||
mod criterion;
|
||||
pub mod database_stats;
|
||||
pub mod disabled_typos_terms;
|
||||
mod error;
|
||||
mod external_documents_ids;
|
||||
pub mod facet;
|
||||
|
@ -127,7 +127,8 @@ pub fn extract_word_docids<R: io::Read + io::Seek>(
|
||||
// merge all deletions
|
||||
let obkv = KvReaderDelAdd::from_slice(value);
|
||||
if let Some(value) = obkv.get(DelAdd::Deletion) {
|
||||
let delete_from_exact = settings_diff.old.exact_attributes.contains(&fid);
|
||||
let delete_from_exact = settings_diff.old.exact_attributes.contains(&fid)
|
||||
|| settings_diff.old.disabled_typos_terms.is_exact(&w);
|
||||
buffer.clear();
|
||||
let mut obkv = KvWriterDelAdd::new(&mut buffer);
|
||||
obkv.insert(DelAdd::Deletion, value)?;
|
||||
@ -139,7 +140,8 @@ pub fn extract_word_docids<R: io::Read + io::Seek>(
|
||||
}
|
||||
// merge all additions
|
||||
if let Some(value) = obkv.get(DelAdd::Addition) {
|
||||
let add_in_exact = settings_diff.new.exact_attributes.contains(&fid);
|
||||
let add_in_exact = settings_diff.new.exact_attributes.contains(&fid)
|
||||
|| settings_diff.new.disabled_typos_terms.is_exact(&w);
|
||||
buffer.clear();
|
||||
let mut obkv = KvWriterDelAdd::new(&mut buffer);
|
||||
obkv.insert(DelAdd::Addition, value)?;
|
||||
|
@ -273,14 +273,11 @@ pub(crate) fn write_typed_chunk_into_index(
|
||||
unreachable!();
|
||||
};
|
||||
let clonable_word_docids = unsafe { as_cloneable_grenad(&word_docids_reader) }?;
|
||||
let clonable_exact_word_docids =
|
||||
unsafe { as_cloneable_grenad(&exact_word_docids_reader) }?;
|
||||
|
||||
word_docids_builder.push(word_docids_reader.into_cursor()?);
|
||||
exact_word_docids_builder.push(exact_word_docids_reader.into_cursor()?);
|
||||
word_fid_docids_builder.push(word_fid_docids_reader.into_cursor()?);
|
||||
fst_merger_builder.push(clonable_word_docids.into_cursor()?);
|
||||
fst_merger_builder.push(clonable_exact_word_docids.into_cursor()?);
|
||||
}
|
||||
|
||||
let word_docids_merger = word_docids_builder.build();
|
||||
|
@ -319,8 +319,11 @@ impl WordDocidsExtractors {
|
||||
let doc_alloc = &context.doc_alloc;
|
||||
|
||||
let exact_attributes = index.exact_attributes(rtxn)?;
|
||||
let is_exact_attribute =
|
||||
|fname: &str| exact_attributes.iter().any(|attr| contained_in(fname, attr));
|
||||
let disabled_typos_terms = index.disabled_typos_terms(rtxn)?;
|
||||
let is_exact = |fname: &str, word: &str| {
|
||||
exact_attributes.iter().any(|attr| contained_in(fname, attr))
|
||||
|| disabled_typos_terms.is_exact(word)
|
||||
};
|
||||
match document_change {
|
||||
DocumentChange::Deletion(inner) => {
|
||||
let mut token_fn = |fname: &str, fid, pos, word: &str| {
|
||||
@ -328,7 +331,7 @@ impl WordDocidsExtractors {
|
||||
fid,
|
||||
pos,
|
||||
word,
|
||||
is_exact_attribute(fname),
|
||||
is_exact(fname, word),
|
||||
inner.docid(),
|
||||
doc_alloc,
|
||||
)
|
||||
@ -356,7 +359,7 @@ impl WordDocidsExtractors {
|
||||
fid,
|
||||
pos,
|
||||
word,
|
||||
is_exact_attribute(fname),
|
||||
is_exact(fname, word),
|
||||
inner.docid(),
|
||||
doc_alloc,
|
||||
)
|
||||
@ -372,7 +375,7 @@ impl WordDocidsExtractors {
|
||||
fid,
|
||||
pos,
|
||||
word,
|
||||
is_exact_attribute(fname),
|
||||
is_exact(fname, word),
|
||||
inner.docid(),
|
||||
doc_alloc,
|
||||
)
|
||||
@ -389,7 +392,7 @@ impl WordDocidsExtractors {
|
||||
fid,
|
||||
pos,
|
||||
word,
|
||||
is_exact_attribute(fname),
|
||||
is_exact(fname, word),
|
||||
inner.docid(),
|
||||
doc_alloc,
|
||||
)
|
||||
|
@ -121,6 +121,7 @@ impl<'a, 'b, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a, 'b> {
|
||||
// do we have set embeddings?
|
||||
if let Some(embeddings) = new_vectors.embeddings {
|
||||
chunks.set_vectors(
|
||||
update.external_document_id(),
|
||||
update.docid(),
|
||||
embeddings
|
||||
.into_vec(&context.doc_alloc, embedder_name)
|
||||
@ -128,7 +129,7 @@ impl<'a, 'b, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a, 'b> {
|
||||
document_id: update.external_document_id().to_string(),
|
||||
error: error.to_string(),
|
||||
})?,
|
||||
);
|
||||
)?;
|
||||
} else if new_vectors.regenerate {
|
||||
let new_rendered = prompt.render_document(
|
||||
update.external_document_id(),
|
||||
@ -209,6 +210,7 @@ impl<'a, 'b, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a, 'b> {
|
||||
chunks.set_regenerate(insertion.docid(), new_vectors.regenerate);
|
||||
if let Some(embeddings) = new_vectors.embeddings {
|
||||
chunks.set_vectors(
|
||||
insertion.external_document_id(),
|
||||
insertion.docid(),
|
||||
embeddings
|
||||
.into_vec(&context.doc_alloc, embedder_name)
|
||||
@ -218,7 +220,7 @@ impl<'a, 'b, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a, 'b> {
|
||||
.to_string(),
|
||||
error: error.to_string(),
|
||||
})?,
|
||||
);
|
||||
)?;
|
||||
} else if new_vectors.regenerate {
|
||||
let rendered = prompt.render_document(
|
||||
insertion.external_document_id(),
|
||||
@ -273,6 +275,7 @@ struct Chunks<'a, 'b, 'extractor> {
|
||||
embedder: &'a Embedder,
|
||||
embedder_id: u8,
|
||||
embedder_name: &'a str,
|
||||
dimensions: usize,
|
||||
prompt: &'a Prompt,
|
||||
possible_embedding_mistakes: &'a PossibleEmbeddingMistakes,
|
||||
user_provided: &'a RefCell<EmbeddingExtractorData<'extractor>>,
|
||||
@ -297,6 +300,7 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> {
|
||||
let capacity = embedder.prompt_count_in_chunk_hint() * embedder.chunk_count_hint();
|
||||
let texts = BVec::with_capacity_in(capacity, doc_alloc);
|
||||
let ids = BVec::with_capacity_in(capacity, doc_alloc);
|
||||
let dimensions = embedder.dimensions();
|
||||
Self {
|
||||
texts,
|
||||
ids,
|
||||
@ -309,6 +313,7 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> {
|
||||
embedder_name,
|
||||
user_provided,
|
||||
has_manual_generation: None,
|
||||
dimensions,
|
||||
}
|
||||
}
|
||||
|
||||
@ -490,7 +495,25 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> {
|
||||
}
|
||||
}
|
||||
|
||||
fn set_vectors(&self, docid: DocumentId, embeddings: Vec<Embedding>) {
|
||||
fn set_vectors(
|
||||
&self,
|
||||
external_docid: &'a str,
|
||||
docid: DocumentId,
|
||||
embeddings: Vec<Embedding>,
|
||||
) -> Result<()> {
|
||||
for (embedding_index, embedding) in embeddings.iter().enumerate() {
|
||||
if embedding.len() != self.dimensions {
|
||||
return Err(UserError::InvalidIndexingVectorDimensions {
|
||||
expected: self.dimensions,
|
||||
found: embedding.len(),
|
||||
embedder_name: self.embedder_name.to_string(),
|
||||
document_id: external_docid.to_string(),
|
||||
embedding_index,
|
||||
}
|
||||
.into());
|
||||
}
|
||||
}
|
||||
self.sender.set_vectors(docid, self.embedder_id, embeddings).unwrap();
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
@ -17,6 +17,7 @@ use super::IndexerConfig;
|
||||
use crate::attribute_patterns::PatternMatch;
|
||||
use crate::constants::RESERVED_GEO_FIELD_NAME;
|
||||
use crate::criterion::Criterion;
|
||||
use crate::disabled_typos_terms::DisabledTyposTerms;
|
||||
use crate::error::UserError;
|
||||
use crate::fields_ids_map::metadata::{FieldIdMapWithMetadata, MetadataBuilder};
|
||||
use crate::filterable_attributes_rules::match_faceted_field;
|
||||
@ -169,6 +170,7 @@ pub struct Settings<'a, 't, 'i> {
|
||||
synonyms: Setting<BTreeMap<String, Vec<String>>>,
|
||||
primary_key: Setting<String>,
|
||||
authorize_typos: Setting<bool>,
|
||||
disable_on_numbers: Setting<bool>,
|
||||
min_word_len_two_typos: Setting<u8>,
|
||||
min_word_len_one_typo: Setting<u8>,
|
||||
exact_words: Setting<BTreeSet<String>>,
|
||||
@ -207,6 +209,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
|
||||
synonyms: Setting::NotSet,
|
||||
primary_key: Setting::NotSet,
|
||||
authorize_typos: Setting::NotSet,
|
||||
disable_on_numbers: Setting::NotSet,
|
||||
exact_words: Setting::NotSet,
|
||||
min_word_len_two_typos: Setting::NotSet,
|
||||
min_word_len_one_typo: Setting::NotSet,
|
||||
@ -354,6 +357,14 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
|
||||
self.min_word_len_one_typo = Setting::Reset;
|
||||
}
|
||||
|
||||
/// Records `disable_on_numbers` as the pending value of the setting
/// (`Setting::Set`) on this settings update.
pub fn set_disable_on_numbers(&mut self, disable_on_numbers: bool) {
|
||||
self.disable_on_numbers = Setting::Set(disable_on_numbers);
|
||||
}
|
||||
|
||||
/// Marks the `disable_on_numbers` setting as pending reset (`Setting::Reset`)
/// on this settings update.
pub fn reset_disable_on_numbers(&mut self) {
|
||||
self.disable_on_numbers = Setting::Reset;
|
||||
}
|
||||
|
||||
pub fn set_exact_words(&mut self, words: BTreeSet<String>) {
|
||||
self.exact_words = Setting::Set(words);
|
||||
}
|
||||
@ -866,6 +877,24 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn update_disabled_typos_terms(&mut self) -> Result<()> {
|
||||
let mut disabled_typos_terms = self.index.disabled_typos_terms(self.wtxn)?;
|
||||
match self.disable_on_numbers {
|
||||
Setting::Set(disable_on_numbers) => {
|
||||
disabled_typos_terms.disable_on_numbers = disable_on_numbers;
|
||||
}
|
||||
Setting::Reset => {
|
||||
self.index.delete_disabled_typos_terms(self.wtxn)?;
|
||||
disabled_typos_terms.disable_on_numbers =
|
||||
DisabledTyposTerms::default().disable_on_numbers;
|
||||
}
|
||||
Setting::NotSet => (),
|
||||
}
|
||||
|
||||
self.index.put_disabled_typos_terms(self.wtxn, &disabled_typos_terms)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn update_exact_words(&mut self) -> Result<()> {
|
||||
match self.exact_words {
|
||||
Setting::Set(ref mut words) => {
|
||||
@ -1246,6 +1275,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
|
||||
self.update_prefix_search()?;
|
||||
self.update_facet_search()?;
|
||||
self.update_localized_attributes_rules()?;
|
||||
self.update_disabled_typos_terms()?;
|
||||
|
||||
let embedding_config_updates = self.update_embedding_configs()?;
|
||||
|
||||
@ -1327,6 +1357,7 @@ impl InnerIndexSettingsDiff {
|
||||
|| old_settings.prefix_search != new_settings.prefix_search
|
||||
|| old_settings.localized_attributes_rules
|
||||
!= new_settings.localized_attributes_rules
|
||||
|| old_settings.disabled_typos_terms != new_settings.disabled_typos_terms
|
||||
};
|
||||
|
||||
let cache_exact_attributes = old_settings.exact_attributes != new_settings.exact_attributes;
|
||||
@ -1526,6 +1557,7 @@ pub(crate) struct InnerIndexSettings {
|
||||
pub user_defined_searchable_attributes: Option<Vec<String>>,
|
||||
pub sortable_fields: HashSet<String>,
|
||||
pub exact_attributes: HashSet<FieldId>,
|
||||
pub disabled_typos_terms: DisabledTyposTerms,
|
||||
pub proximity_precision: ProximityPrecision,
|
||||
pub embedding_configs: EmbeddingConfigs,
|
||||
pub geo_fields_ids: Option<(FieldId, FieldId)>,
|
||||
@ -1574,7 +1606,7 @@ impl InnerIndexSettings {
|
||||
.map(|fields| fields.into_iter().map(|f| f.to_string()).collect());
|
||||
let builder = MetadataBuilder::from_index(index, rtxn)?;
|
||||
let fields_ids_map = FieldIdMapWithMetadata::new(fields_ids_map, builder);
|
||||
|
||||
let disabled_typos_terms = index.disabled_typos_terms(rtxn)?;
|
||||
Ok(Self {
|
||||
stop_words,
|
||||
allowed_separators,
|
||||
@ -1592,6 +1624,7 @@ impl InnerIndexSettings {
|
||||
geo_fields_ids,
|
||||
prefix_search,
|
||||
facet_search,
|
||||
disabled_typos_terms,
|
||||
})
|
||||
}
|
||||
|
||||
|
@ -896,6 +896,7 @@ fn test_correct_settings_init() {
|
||||
localized_attributes_rules,
|
||||
prefix_search,
|
||||
facet_search,
|
||||
disable_on_numbers,
|
||||
} = settings;
|
||||
assert!(matches!(searchable_fields, Setting::NotSet));
|
||||
assert!(matches!(displayed_fields, Setting::NotSet));
|
||||
@ -923,6 +924,7 @@ fn test_correct_settings_init() {
|
||||
assert!(matches!(localized_attributes_rules, Setting::NotSet));
|
||||
assert!(matches!(prefix_search, Setting::NotSet));
|
||||
assert!(matches!(facet_search, Setting::NotSet));
|
||||
assert!(matches!(disable_on_numbers, Setting::NotSet));
|
||||
})
|
||||
.unwrap();
|
||||
}
|
||||
|
Reference in New Issue
Block a user