mirror of
https://github.com/meilisearch/meilisearch.git
synced 2025-07-22 14:21:03 +00:00
Compare commits
83 Commits
prototype-
...
prototype-
Author | SHA1 | Date | |
---|---|---|---|
8dff899954 | |||
133674751f | |||
9183ad47a7 | |||
6d5b79bc1f | |||
082b171d45 | |||
3e31479c81 | |||
b1016649d0 | |||
408b3a6cd1 | |||
02123a3326 | |||
5644af10ef | |||
389d673bc2 | |||
bba401eb37 | |||
95fcd17373 | |||
ac4bc143c4 | |||
f33a1282f8 | |||
4d5971f343 | |||
ecb5c506b3 | |||
3698aef66b | |||
7f5ab3cef5 | |||
c668043c4f | |||
5a305bfdea | |||
f4dd73ec8c | |||
66dce4600d | |||
fe51ceca6d | |||
88174b8ae4 | |||
ebca29f3de | |||
c793b6ef6d | |||
cbbfff3594 | |||
dbcf50589b | |||
3e5cd027a5 | |||
7468c1cf8d | |||
d4aeff92d0 | |||
e87cb373de | |||
9b76501875 | |||
6247e95dc3 | |||
b3173d0423 | |||
96cc5319c8 | |||
0c7003c5df | |||
a1aa999026 | |||
aa0bbbb246 | |||
a04012c33e | |||
c71b5d09ff | |||
248e22005a | |||
ab43a8a949 | |||
4a8459b799 | |||
442de982a9 | |||
c923adf222 | |||
2dfee2fad5 | |||
4a68e9f6ae | |||
206887c7a2 | |||
2f170fe2d5 | |||
df29ba709a | |||
2dd9dd6d0a | |||
3acfab2eb7 | |||
e1f27de51a | |||
abae31aee0 | |||
70ce0095ea | |||
19137be0ea | |||
a1ea224da9 | |||
87a93ba47d | |||
eaf113ef34 | |||
5ab901dd30 | |||
e5ae337aae | |||
bad46f88d6 | |||
a489b406b4 | |||
02c3d6b265 | |||
b5e4a55af6 | |||
a7e368aaa6 | |||
893200ab87 | |||
aabce52b1b | |||
64079fc894 | |||
8fff5fc281 | |||
4089dd04a5 | |||
cf864a1c2e | |||
0661c86f16 | |||
a6c02f7684 | |||
89e72fab32 | |||
171b41be24 | |||
c26d356a35 | |||
217fbc777f | |||
c2c73c1f25 | |||
7a49a056fa | |||
fd4be26718 |
514
Cargo.lock
generated
514
Cargo.lock
generated
File diff suppressed because it is too large
Load Diff
@ -17,7 +17,8 @@ members = [
|
||||
"benchmarks",
|
||||
"fuzzers",
|
||||
"tracing-trace",
|
||||
"xtask", "build-info",
|
||||
"xtask",
|
||||
"build-info",
|
||||
]
|
||||
|
||||
[workspace.package]
|
||||
|
@ -166,6 +166,7 @@ impl From<KindWithContent> for KindDump {
|
||||
documents_count,
|
||||
allow_index_creation,
|
||||
},
|
||||
KindWithContent::DocumentEdition { .. } => todo!(),
|
||||
KindWithContent::DocumentDeletion { documents_ids, .. } => {
|
||||
KindDump::DocumentDeletion { documents_ids }
|
||||
}
|
||||
@ -256,8 +257,8 @@ pub(crate) mod test {
|
||||
|
||||
pub fn create_test_settings() -> Settings<Checked> {
|
||||
let settings = Settings {
|
||||
displayed_attributes: Setting::Set(vec![S("race"), S("name")]),
|
||||
searchable_attributes: Setting::Set(vec![S("name"), S("race")]),
|
||||
displayed_attributes: Setting::Set(vec![S("race"), S("name")]).into(),
|
||||
searchable_attributes: Setting::Set(vec![S("name"), S("race")]).into(),
|
||||
filterable_attributes: Setting::Set(btreeset! { S("race"), S("age") }),
|
||||
sortable_attributes: Setting::Set(btreeset! { S("age") }),
|
||||
ranking_rules: Setting::NotSet,
|
||||
|
@ -315,8 +315,8 @@ impl From<v5::ResponseError> for v6::ResponseError {
|
||||
impl<T> From<v5::Settings<T>> for v6::Settings<v6::Unchecked> {
|
||||
fn from(settings: v5::Settings<T>) -> Self {
|
||||
v6::Settings {
|
||||
displayed_attributes: settings.displayed_attributes.into(),
|
||||
searchable_attributes: settings.searchable_attributes.into(),
|
||||
displayed_attributes: v6::Setting::from(settings.displayed_attributes).into(),
|
||||
searchable_attributes: v6::Setting::from(settings.searchable_attributes).into(),
|
||||
filterable_attributes: settings.filterable_attributes.into(),
|
||||
sortable_attributes: settings.sortable_attributes.into(),
|
||||
ranking_rules: {
|
||||
|
@ -568,7 +568,7 @@ pub mod tests {
|
||||
insta::assert_display_snapshot!(p(r"title = 'foo\\\\'"), @r#"{title} = {foo\\}"#);
|
||||
insta::assert_display_snapshot!(p(r"title = 'foo\\\\\\'"), @r#"{title} = {foo\\\}"#);
|
||||
insta::assert_display_snapshot!(p(r"title = 'foo\\\\\\\\'"), @r#"{title} = {foo\\\\}"#);
|
||||
// but it also works with other sequencies
|
||||
// but it also works with other sequences
|
||||
insta::assert_display_snapshot!(p(r#"title = 'foo\x20\n\t\"\'"'"#), @"{title} = {foo \n\t\"\'\"}");
|
||||
}
|
||||
|
||||
|
@ -37,7 +37,7 @@ time = { version = "0.3.31", features = [
|
||||
"macros",
|
||||
] }
|
||||
tracing = "0.1.40"
|
||||
ureq = "2.9.1"
|
||||
ureq = "2.9.7"
|
||||
uuid = { version = "1.6.1", features = ["serde", "v4"] }
|
||||
|
||||
[dev-dependencies]
|
||||
|
@ -24,6 +24,7 @@ enum AutobatchKind {
|
||||
allow_index_creation: bool,
|
||||
primary_key: Option<String>,
|
||||
},
|
||||
DocumentEdition,
|
||||
DocumentDeletion,
|
||||
DocumentDeletionByFilter,
|
||||
DocumentClear,
|
||||
@ -63,6 +64,7 @@ impl From<KindWithContent> for AutobatchKind {
|
||||
primary_key,
|
||||
..
|
||||
} => AutobatchKind::DocumentImport { method, allow_index_creation, primary_key },
|
||||
KindWithContent::DocumentEdition { .. } => AutobatchKind::DocumentEdition,
|
||||
KindWithContent::DocumentDeletion { .. } => AutobatchKind::DocumentDeletion,
|
||||
KindWithContent::DocumentClear { .. } => AutobatchKind::DocumentClear,
|
||||
KindWithContent::DocumentDeletionByFilter { .. } => {
|
||||
@ -98,6 +100,9 @@ pub enum BatchKind {
|
||||
primary_key: Option<String>,
|
||||
operation_ids: Vec<TaskId>,
|
||||
},
|
||||
DocumentEdition {
|
||||
id: TaskId,
|
||||
},
|
||||
DocumentDeletion {
|
||||
deletion_ids: Vec<TaskId>,
|
||||
},
|
||||
@ -199,6 +204,7 @@ impl BatchKind {
|
||||
}),
|
||||
allow_index_creation,
|
||||
),
|
||||
K::DocumentEdition => (Break(BatchKind::DocumentEdition { id: task_id }), false),
|
||||
K::DocumentDeletion => {
|
||||
(Continue(BatchKind::DocumentDeletion { deletion_ids: vec![task_id] }), false)
|
||||
}
|
||||
@ -222,7 +228,7 @@ impl BatchKind {
|
||||
|
||||
match (self, kind) {
|
||||
// We don't batch any of these operations
|
||||
(this, K::IndexCreation | K::IndexUpdate | K::IndexSwap | K::DocumentDeletionByFilter) => Break(this),
|
||||
(this, K::IndexCreation | K::IndexUpdate | K::IndexSwap | K::DocumentEdition | K::DocumentDeletionByFilter) => Break(this),
|
||||
// We must not batch tasks that don't have the same index creation rights if the index doesn't already exist.
|
||||
(this, kind) if !index_already_exists && this.allow_index_creation() == Some(false) && kind.allow_index_creation() == Some(true) => {
|
||||
Break(this)
|
||||
@ -519,6 +525,7 @@ impl BatchKind {
|
||||
| BatchKind::IndexDeletion { .. }
|
||||
| BatchKind::IndexUpdate { .. }
|
||||
| BatchKind::IndexSwap { .. }
|
||||
| BatchKind::DocumentEdition { .. }
|
||||
| BatchKind::DocumentDeletionByFilter { .. },
|
||||
_,
|
||||
) => {
|
||||
|
@ -13,7 +13,7 @@ We can combine the two tasks in a single batch:
|
||||
1. import documents X and Y
|
||||
|
||||
Processing this batch is functionally equivalent to processing the two
|
||||
tasks individally, but should be much faster since we are only performing
|
||||
tasks individually, but should be much faster since we are only performing
|
||||
one indexing operation.
|
||||
*/
|
||||
|
||||
@ -31,7 +31,7 @@ use meilisearch_types::milli::heed::CompactionOption;
|
||||
use meilisearch_types::milli::update::{
|
||||
IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Settings as MilliSettings,
|
||||
};
|
||||
use meilisearch_types::milli::{self, Filter};
|
||||
use meilisearch_types::milli::{self, Filter, Object};
|
||||
use meilisearch_types::settings::{apply_settings_to_builder, Settings, Unchecked};
|
||||
use meilisearch_types::tasks::{Details, IndexSwap, Kind, KindWithContent, Status, Task};
|
||||
use meilisearch_types::{compression, Index, VERSION_FILE_NAME};
|
||||
@ -103,6 +103,10 @@ pub(crate) enum IndexOperation {
|
||||
operations: Vec<DocumentOperation>,
|
||||
tasks: Vec<Task>,
|
||||
},
|
||||
DocumentEdition {
|
||||
index_uid: String,
|
||||
task: Task,
|
||||
},
|
||||
IndexDocumentDeletionByFilter {
|
||||
index_uid: String,
|
||||
task: Task,
|
||||
@ -161,7 +165,8 @@ impl Batch {
|
||||
| IndexOperation::DocumentClear { tasks, .. } => {
|
||||
RoaringBitmap::from_iter(tasks.iter().map(|task| task.uid))
|
||||
}
|
||||
IndexOperation::IndexDocumentDeletionByFilter { task, .. } => {
|
||||
IndexOperation::DocumentEdition { task, .. }
|
||||
| IndexOperation::IndexDocumentDeletionByFilter { task, .. } => {
|
||||
RoaringBitmap::from_sorted_iter(std::iter::once(task.uid)).unwrap()
|
||||
}
|
||||
IndexOperation::SettingsAndDocumentOperation {
|
||||
@ -225,6 +230,7 @@ impl IndexOperation {
|
||||
pub fn index_uid(&self) -> &str {
|
||||
match self {
|
||||
IndexOperation::DocumentOperation { index_uid, .. }
|
||||
| IndexOperation::DocumentEdition { index_uid, .. }
|
||||
| IndexOperation::IndexDocumentDeletionByFilter { index_uid, .. }
|
||||
| IndexOperation::DocumentClear { index_uid, .. }
|
||||
| IndexOperation::Settings { index_uid, .. }
|
||||
@ -240,6 +246,9 @@ impl fmt::Display for IndexOperation {
|
||||
IndexOperation::DocumentOperation { .. } => {
|
||||
f.write_str("IndexOperation::DocumentOperation")
|
||||
}
|
||||
IndexOperation::DocumentEdition { .. } => {
|
||||
f.write_str("IndexOperation::DocumentEdition")
|
||||
}
|
||||
IndexOperation::IndexDocumentDeletionByFilter { .. } => {
|
||||
f.write_str("IndexOperation::IndexDocumentDeletionByFilter")
|
||||
}
|
||||
@ -292,6 +301,21 @@ impl IndexScheduler {
|
||||
_ => unreachable!(),
|
||||
}
|
||||
}
|
||||
BatchKind::DocumentEdition { id } => {
|
||||
let task = self.get_task(rtxn, id)?.ok_or(Error::CorruptedTaskQueue)?;
|
||||
match &task.kind {
|
||||
KindWithContent::DocumentEdition { index_uid, .. } => {
|
||||
Ok(Some(Batch::IndexOperation {
|
||||
op: IndexOperation::DocumentEdition {
|
||||
index_uid: index_uid.clone(),
|
||||
task,
|
||||
},
|
||||
must_create_index: false,
|
||||
}))
|
||||
}
|
||||
_ => unreachable!(),
|
||||
}
|
||||
}
|
||||
BatchKind::DocumentOperation { method, operation_ids, .. } => {
|
||||
let tasks = self.get_existing_tasks(rtxn, operation_ids)?;
|
||||
let primary_key = tasks
|
||||
@ -1334,6 +1358,64 @@ impl IndexScheduler {
|
||||
|
||||
Ok(tasks)
|
||||
}
|
||||
IndexOperation::DocumentEdition { mut task, .. } => {
|
||||
let (filter, context, function) =
|
||||
if let KindWithContent::DocumentEdition {
|
||||
filter_expr, context, function, ..
|
||||
} = &task.kind
|
||||
{
|
||||
(filter_expr, context, function)
|
||||
} else {
|
||||
unreachable!()
|
||||
};
|
||||
let result_count = edit_documents_by_function(
|
||||
index_wtxn,
|
||||
filter,
|
||||
context.clone(),
|
||||
function,
|
||||
self.index_mapper.indexer_config(),
|
||||
self.must_stop_processing.clone(),
|
||||
index,
|
||||
);
|
||||
let (original_filter, context, function) = if let Some(Details::DocumentEdition {
|
||||
original_filter,
|
||||
context,
|
||||
function,
|
||||
..
|
||||
}) = task.details
|
||||
{
|
||||
(original_filter, context, function)
|
||||
} else {
|
||||
// In the case of a `documentEdition` the details MUST be set
|
||||
unreachable!();
|
||||
};
|
||||
|
||||
match result_count {
|
||||
Ok((deleted_documents, edited_documents)) => {
|
||||
task.status = Status::Succeeded;
|
||||
task.details = Some(Details::DocumentEdition {
|
||||
original_filter,
|
||||
context,
|
||||
function,
|
||||
deleted_documents: Some(deleted_documents),
|
||||
edited_documents: Some(edited_documents),
|
||||
});
|
||||
}
|
||||
Err(e) => {
|
||||
task.status = Status::Failed;
|
||||
task.details = Some(Details::DocumentEdition {
|
||||
original_filter,
|
||||
context,
|
||||
function,
|
||||
deleted_documents: Some(0),
|
||||
edited_documents: Some(0),
|
||||
});
|
||||
task.error = Some(e.into());
|
||||
}
|
||||
}
|
||||
|
||||
Ok(vec![task])
|
||||
}
|
||||
IndexOperation::IndexDocumentDeletionByFilter { mut task, index_uid: _ } => {
|
||||
let filter =
|
||||
if let KindWithContent::DocumentDeletionByFilter { filter_expr, .. } =
|
||||
@ -1622,3 +1704,44 @@ fn delete_document_by_filter<'a>(
|
||||
0
|
||||
})
|
||||
}
|
||||
|
||||
/// Edits the documents of `index` selected by `filter` by running `code` on them.
///
/// The candidate set is the result of evaluating `filter` against the index; when
/// no filter is given (or `Filter::from_json` yields `None`) every document id of
/// the index is a candidate. The edition itself is delegated to
/// `IndexDocuments::edit_documents`, after which the builder is executed inside
/// `wtxn`.
///
/// Returns a pair of counters. The caller in the `IndexOperation::DocumentEdition`
/// arm binds them as `(deleted_documents, edited_documents)`.
///
/// # Errors
///
/// - A filter that fails to parse is returned as an error.
/// - An `InvalidFilter` user error raised while evaluating the filter is re-tagged
///   with `Code::InvalidDocumentFilter`; any other evaluation error is converted as is.
/// - Errors from creating the builder, from `edit_documents` and from `execute`
///   are propagated with `?`.
///
/// NOTE(review): `code` and `context` are forwarded untouched to `edit_documents`;
/// presumably `code` is the user-supplied edition function — confirm against milli.
fn edit_documents_by_function<'a>(
|
||||
wtxn: &mut RwTxn<'a>,
|
||||
filter: &Option<serde_json::Value>,
|
||||
context: Option<Object>,
|
||||
code: &str,
|
||||
indexer_config: &IndexerConfig,
|
||||
must_stop_processing: MustStopProcessing,
|
||||
index: &'a Index,
|
||||
) -> Result<(u64, u64)> {
|
||||
// Resolve the set of document ids the edition applies to.
let candidates = match filter.as_ref().map(Filter::from_json) {
|
||||
Some(Ok(Some(filter))) => filter.evaluate(wtxn, index).map_err(|err| match err {
|
||||
milli::Error::UserError(milli::UserError::InvalidFilter(_)) => {
|
||||
Error::from(err).with_custom_error_code(Code::InvalidDocumentFilter)
|
||||
}
|
||||
e => e.into(),
|
||||
})?,
|
||||
// No filter at all, or a filter that parses to nothing: target every document.
None | Some(Ok(None)) => index.documents_ids(wtxn)?,
|
||||
Some(Err(e)) => return Err(e.into()),
|
||||
};
|
||||
|
||||
// The builder is configured with the `ReplaceDocuments` update method.
let config = IndexDocumentsConfig {
|
||||
update_method: IndexDocumentsMethod::ReplaceDocuments,
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let mut builder = milli::update::IndexDocuments::new(
|
||||
wtxn,
|
||||
index,
|
||||
indexer_config,
|
||||
config,
|
||||
|indexing_step| tracing::debug!(update = ?indexing_step),
|
||||
|| must_stop_processing.get(),
|
||||
)?;
|
||||
|
||||
// `edit_documents` consumes the builder and hands back a new one together with
// the counters, so the binding is reassigned before executing.
let (new_builder, count) = builder.edit_documents(&candidates, context, code)?;
|
||||
builder = new_builder;
|
||||
|
||||
let _ = builder.execute()?;
|
||||
// NOTE(review): `unwrap` panics if `count` carries a failure; its exact type is
// not visible here — confirm whether this should be propagated as an error instead.
Ok(count.unwrap())
|
||||
}
|
||||
|
@ -178,6 +178,17 @@ fn snapshot_details(d: &Details) -> String {
|
||||
} => {
|
||||
format!("{{ received_documents: {received_documents}, indexed_documents: {indexed_documents:?} }}")
|
||||
}
|
||||
Details::DocumentEdition {
|
||||
deleted_documents,
|
||||
edited_documents,
|
||||
original_filter,
|
||||
context,
|
||||
function,
|
||||
} => {
|
||||
format!(
|
||||
"{{ deleted_documents: {deleted_documents:?}, edited_documents: {edited_documents:?}, context: {context:?}, function: {function:?}, original_filter: {original_filter:?} }}"
|
||||
)
|
||||
}
|
||||
Details::SettingsUpdate { settings } => {
|
||||
format!("{{ settings: {settings:?} }}")
|
||||
}
|
||||
|
@ -3041,6 +3041,7 @@ mod tests {
|
||||
source: Setting::Set(milli::vector::settings::EmbedderSource::Rest),
|
||||
api_key: Setting::Set(S("My super secret")),
|
||||
url: Setting::Set(S("http://localhost:7777")),
|
||||
dimensions: Setting::Set(4),
|
||||
..Default::default()
|
||||
};
|
||||
embedders.insert(S("default"), Setting::Set(embedding_settings));
|
||||
@ -4748,6 +4749,7 @@ mod tests {
|
||||
"types": {
|
||||
"documentAdditionOrUpdate": 0,
|
||||
"documentDeletion": 0,
|
||||
"documentEdition": 0,
|
||||
"dumpCreation": 0,
|
||||
"indexCreation": 3,
|
||||
"indexDeletion": 0,
|
||||
@ -4779,6 +4781,7 @@ mod tests {
|
||||
"types": {
|
||||
"documentAdditionOrUpdate": 0,
|
||||
"documentDeletion": 0,
|
||||
"documentEdition": 0,
|
||||
"dumpCreation": 0,
|
||||
"indexCreation": 3,
|
||||
"indexDeletion": 0,
|
||||
@ -4817,6 +4820,7 @@ mod tests {
|
||||
"types": {
|
||||
"documentAdditionOrUpdate": 0,
|
||||
"documentDeletion": 0,
|
||||
"documentEdition": 0,
|
||||
"dumpCreation": 0,
|
||||
"indexCreation": 3,
|
||||
"indexDeletion": 0,
|
||||
@ -4856,6 +4860,7 @@ mod tests {
|
||||
"types": {
|
||||
"documentAdditionOrUpdate": 0,
|
||||
"documentDeletion": 0,
|
||||
"documentEdition": 0,
|
||||
"dumpCreation": 0,
|
||||
"indexCreation": 3,
|
||||
"indexDeletion": 0,
|
||||
|
@ -7,6 +7,7 @@ expression: task.details
|
||||
"default": {
|
||||
"source": "rest",
|
||||
"apiKey": "MyXXXX...",
|
||||
"dimensions": 4,
|
||||
"url": "http://localhost:7777"
|
||||
}
|
||||
}
|
||||
|
@ -6,7 +6,7 @@ expression: embedding_config.embedder_options
|
||||
"Rest": {
|
||||
"api_key": "My super secret",
|
||||
"distribution": null,
|
||||
"dimensions": null,
|
||||
"dimensions": 4,
|
||||
"url": "http://localhost:7777",
|
||||
"query": null,
|
||||
"input_field": [
|
||||
|
@ -7,6 +7,7 @@ expression: task.details
|
||||
"default": {
|
||||
"source": "rest",
|
||||
"apiKey": "MyXXXX...",
|
||||
"dimensions": 4,
|
||||
"url": "http://localhost:7777"
|
||||
}
|
||||
}
|
||||
|
@ -6,7 +6,7 @@ source: index-scheduler/src/lib.rs
|
||||
[]
|
||||
----------------------------------------------------------------------
|
||||
### All Tasks:
|
||||
0 {uid: 0, status: enqueued, details: { settings: Settings { displayed_attributes: NotSet, searchable_attributes: NotSet, filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: NotSet, document_template: NotSet, url: Set("http://localhost:7777"), query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData<meilisearch_types::settings::Unchecked> } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: NotSet, searchable_attributes: NotSet, filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: NotSet, document_template: NotSet, url: Set("http://localhost:7777"), query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData<meilisearch_types::settings::Unchecked> }, is_deletion: false, allow_index_creation: true }}
|
||||
0 {uid: 0, status: enqueued, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(4), document_template: NotSet, url: Set("http://localhost:7777"), query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData<meilisearch_types::settings::Unchecked> } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(4), document_template: NotSet, url: Set("http://localhost:7777"), query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData<meilisearch_types::settings::Unchecked> }, is_deletion: false, allow_index_creation: true }}
|
||||
----------------------------------------------------------------------
|
||||
### Status:
|
||||
enqueued [0,]
|
||||
|
@ -6,7 +6,7 @@ source: index-scheduler/src/lib.rs
|
||||
[]
|
||||
----------------------------------------------------------------------
|
||||
### All Tasks:
|
||||
0 {uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: NotSet, searchable_attributes: NotSet, filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: NotSet, document_template: NotSet, url: Set("http://localhost:7777"), query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData<meilisearch_types::settings::Unchecked> } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: NotSet, searchable_attributes: NotSet, filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: NotSet, document_template: NotSet, url: Set("http://localhost:7777"), query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData<meilisearch_types::settings::Unchecked> }, is_deletion: false, allow_index_creation: true }}
|
||||
0 {uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(4), document_template: NotSet, url: Set("http://localhost:7777"), query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData<meilisearch_types::settings::Unchecked> } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(4), document_template: NotSet, url: Set("http://localhost:7777"), query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData<meilisearch_types::settings::Unchecked> }, is_deletion: false, allow_index_creation: true }}
|
||||
----------------------------------------------------------------------
|
||||
### Status:
|
||||
enqueued []
|
||||
|
@ -238,6 +238,7 @@ pub fn swap_index_uid_in_task(task: &mut Task, swap: (&str, &str)) {
|
||||
let mut index_uids = vec![];
|
||||
match &mut task.kind {
|
||||
K::DocumentAdditionOrUpdate { index_uid, .. } => index_uids.push(index_uid),
|
||||
K::DocumentEdition { index_uid, .. } => index_uids.push(index_uid),
|
||||
K::DocumentDeletion { index_uid, .. } => index_uids.push(index_uid),
|
||||
K::DocumentDeletionByFilter { index_uid, .. } => index_uids.push(index_uid),
|
||||
K::DocumentClear { index_uid } => index_uids.push(index_uid),
|
||||
@ -408,7 +409,26 @@ impl IndexScheduler {
|
||||
match status {
|
||||
Status::Succeeded => assert!(indexed_documents <= received_documents),
|
||||
Status::Failed | Status::Canceled => assert_eq!(indexed_documents, 0),
|
||||
status => panic!("DocumentAddition can't have an indexed_document set if it's {}", status),
|
||||
status => panic!("DocumentAddition can't have an indexed_documents set if it's {}", status),
|
||||
}
|
||||
}
|
||||
None => {
|
||||
assert!(matches!(status, Status::Enqueued | Status::Processing))
|
||||
}
|
||||
}
|
||||
}
|
||||
Details::DocumentEdition { edited_documents, .. } => {
|
||||
assert_eq!(kind.as_kind(), Kind::DocumentEdition);
|
||||
match edited_documents {
|
||||
Some(edited_documents) => {
|
||||
assert!(matches!(
|
||||
status,
|
||||
Status::Succeeded | Status::Failed | Status::Canceled
|
||||
));
|
||||
match status {
|
||||
Status::Succeeded => (),
|
||||
Status::Failed | Status::Canceled => assert_eq!(edited_documents, 0),
|
||||
status => panic!("DocumentEdition can't have an edited_documents set if it's {}", status),
|
||||
}
|
||||
}
|
||||
None => {
|
||||
|
@ -44,6 +44,7 @@ all-tokenizations = ["milli/all-tokenizations"]
|
||||
|
||||
# chinese specialized tokenization
|
||||
chinese = ["milli/chinese"]
|
||||
chinese-pinyin = ["milli/chinese-pinyin"]
|
||||
# hebrew specialized tokenization
|
||||
hebrew = ["milli/hebrew"]
|
||||
# japanese specialized tokenization
|
||||
@ -56,3 +57,5 @@ greek = ["milli/greek"]
|
||||
khmer = ["milli/khmer"]
|
||||
# allow vietnamese specialized tokenization
|
||||
vietnamese = ["milli/vietnamese"]
|
||||
# force swedish character recomposition
|
||||
swedish-recomposition = ["milli/swedish-recomposition"]
|
||||
|
@ -26,7 +26,7 @@ pub type DeserrQueryParamError<C = BadRequest> = DeserrError<DeserrQueryParam, C
|
||||
|
||||
/// A request deserialization error.
|
||||
///
|
||||
/// The first generic paramater is a marker type describing the format of the request: either json (e.g. [`DeserrJson`] or [`DeserrQueryParam`]).
|
||||
/// The first generic parameter is a marker type describing the format of the request: either json (e.g. [`DeserrJson`] or [`DeserrQueryParam`]).
|
||||
/// The second generic parameter is the default error code for the deserialization error, in case it is not given.
|
||||
pub struct DeserrError<Format, C: Default + ErrorCode> {
|
||||
pub msg: String,
|
||||
|
@ -3,7 +3,7 @@ use std::convert::Infallible;
|
||||
use std::fmt;
|
||||
use std::marker::PhantomData;
|
||||
use std::num::NonZeroUsize;
|
||||
use std::ops::ControlFlow;
|
||||
use std::ops::{ControlFlow, Deref};
|
||||
use std::str::FromStr;
|
||||
|
||||
use deserr::{DeserializeError, Deserr, ErrorKind, MergeWithError, ValuePointerRef};
|
||||
@ -143,21 +143,13 @@ impl MergeWithError<milli::CriterionError> for DeserrJsonError<InvalidSettingsRa
|
||||
)]
|
||||
#[deserr(error = DeserrJsonError, rename_all = camelCase, deny_unknown_fields)]
|
||||
pub struct Settings<T> {
|
||||
#[serde(
|
||||
default,
|
||||
serialize_with = "serialize_with_wildcard",
|
||||
skip_serializing_if = "Setting::is_not_set"
|
||||
)]
|
||||
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
|
||||
#[deserr(default, error = DeserrJsonError<InvalidSettingsDisplayedAttributes>)]
|
||||
pub displayed_attributes: Setting<Vec<String>>,
|
||||
pub displayed_attributes: WildcardSetting,
|
||||
|
||||
#[serde(
|
||||
default,
|
||||
serialize_with = "serialize_with_wildcard",
|
||||
skip_serializing_if = "Setting::is_not_set"
|
||||
)]
|
||||
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
|
||||
#[deserr(default, error = DeserrJsonError<InvalidSettingsSearchableAttributes>)]
|
||||
pub searchable_attributes: Setting<Vec<String>>,
|
||||
pub searchable_attributes: WildcardSetting,
|
||||
|
||||
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
|
||||
#[deserr(default, error = DeserrJsonError<InvalidSettingsFilterableAttributes>)]
|
||||
@ -251,8 +243,8 @@ impl<T> Settings<T> {
|
||||
impl Settings<Checked> {
|
||||
pub fn cleared() -> Settings<Checked> {
|
||||
Settings {
|
||||
displayed_attributes: Setting::Reset,
|
||||
searchable_attributes: Setting::Reset,
|
||||
displayed_attributes: Setting::Reset.into(),
|
||||
searchable_attributes: Setting::Reset.into(),
|
||||
filterable_attributes: Setting::Reset,
|
||||
sortable_attributes: Setting::Reset,
|
||||
ranking_rules: Setting::Reset,
|
||||
@ -319,7 +311,7 @@ impl Settings<Checked> {
|
||||
|
||||
impl Settings<Unchecked> {
|
||||
pub fn check(self) -> Settings<Checked> {
|
||||
let displayed_attributes = match self.displayed_attributes {
|
||||
let displayed_attributes = match self.displayed_attributes.0 {
|
||||
Setting::Set(fields) => {
|
||||
if fields.iter().any(|f| f == "*") {
|
||||
Setting::Reset
|
||||
@ -330,7 +322,7 @@ impl Settings<Unchecked> {
|
||||
otherwise => otherwise,
|
||||
};
|
||||
|
||||
let searchable_attributes = match self.searchable_attributes {
|
||||
let searchable_attributes = match self.searchable_attributes.0 {
|
||||
Setting::Set(fields) => {
|
||||
if fields.iter().any(|f| f == "*") {
|
||||
Setting::Reset
|
||||
@ -342,8 +334,8 @@ impl Settings<Unchecked> {
|
||||
};
|
||||
|
||||
Settings {
|
||||
displayed_attributes,
|
||||
searchable_attributes,
|
||||
displayed_attributes: displayed_attributes.into(),
|
||||
searchable_attributes: searchable_attributes.into(),
|
||||
filterable_attributes: self.filterable_attributes,
|
||||
sortable_attributes: self.sortable_attributes,
|
||||
ranking_rules: self.ranking_rules,
|
||||
@ -412,13 +404,13 @@ pub fn apply_settings_to_builder(
|
||||
_kind,
|
||||
} = settings;
|
||||
|
||||
match searchable_attributes {
|
||||
match searchable_attributes.deref() {
|
||||
Setting::Set(ref names) => builder.set_searchable_fields(names.clone()),
|
||||
Setting::Reset => builder.reset_searchable_fields(),
|
||||
Setting::NotSet => (),
|
||||
}
|
||||
|
||||
match displayed_attributes {
|
||||
match displayed_attributes.deref() {
|
||||
Setting::Set(ref names) => builder.set_displayed_fields(names.clone()),
|
||||
Setting::Reset => builder.reset_displayed_fields(),
|
||||
Setting::NotSet => (),
|
||||
@ -690,11 +682,13 @@ pub fn settings(
|
||||
displayed_attributes: match displayed_attributes {
|
||||
Some(attrs) => Setting::Set(attrs),
|
||||
None => Setting::Reset,
|
||||
},
|
||||
}
|
||||
.into(),
|
||||
searchable_attributes: match searchable_attributes {
|
||||
Some(attrs) => Setting::Set(attrs),
|
||||
None => Setting::Reset,
|
||||
},
|
||||
}
|
||||
.into(),
|
||||
filterable_attributes: Setting::Set(filterable_attributes),
|
||||
sortable_attributes: Setting::Set(sortable_attributes),
|
||||
ranking_rules: Setting::Set(criteria.iter().map(|c| c.clone().into()).collect()),
|
||||
@ -848,6 +842,41 @@ impl From<ProximityPrecisionView> for ProximityPrecision {
|
||||
}
|
||||
}
|
||||
|
||||
/// Newtype around `Setting<Vec<String>>`, used as the type of the
/// `displayed_attributes` and `searchable_attributes` fields of `Settings`.
///
/// `Deserialize` is derived, while `Serialize` is implemented by hand and goes
/// through `serialize_with_wildcard`.
#[derive(Debug, Clone, Default, Deserialize, PartialEq, Eq)]
|
||||
pub struct WildcardSetting(Setting<Vec<String>>);
|
||||
|
||||
/// Wraps a plain `Setting<Vec<String>>`, which lets call sites build the field
/// with `.into()` (e.g. `Setting::Reset.into()`).
impl From<Setting<Vec<String>>> for WildcardSetting {
|
||||
fn from(setting: Setting<Vec<String>>) -> Self {
|
||||
Self(setting)
|
||||
}
|
||||
}
|
||||
|
||||
/// Serializes the wrapped setting by delegating to `serialize_with_wildcard`.
///
/// NOTE(review): the helper is defined outside this view; presumably it emits the
/// `"*"` wildcard for a reset setting — confirm against its definition.
impl Serialize for WildcardSetting {
|
||||
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
|
||||
where
|
||||
S: Serializer,
|
||||
{
|
||||
serialize_with_wildcard(&self.0, serializer)
|
||||
}
|
||||
}
|
||||
|
||||
/// Deserializes through `Setting`'s own `Deserr` implementation and wraps the
/// result; an error from the inner deserializer is returned unchanged.
impl<E: deserr::DeserializeError> Deserr<E> for WildcardSetting {
|
||||
fn deserialize_from_value<V: deserr::IntoValue>(
|
||||
value: deserr::Value<V>,
|
||||
location: ValuePointerRef<'_>,
|
||||
) -> Result<Self, E> {
|
||||
Ok(Self(Setting::deserialize_from_value(value, location)?))
|
||||
}
|
||||
}
|
||||
|
||||
impl std::ops::Deref for WildcardSetting {
|
||||
type Target = Setting<Vec<String>>;
|
||||
|
||||
fn deref(&self) -> &Self::Target {
|
||||
&self.0
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub(crate) mod test {
|
||||
use super::*;
|
||||
@ -856,8 +885,8 @@ pub(crate) mod test {
|
||||
fn test_setting_check() {
|
||||
// test no changes
|
||||
let settings = Settings {
|
||||
displayed_attributes: Setting::Set(vec![String::from("hello")]),
|
||||
searchable_attributes: Setting::Set(vec![String::from("hello")]),
|
||||
displayed_attributes: Setting::Set(vec![String::from("hello")]).into(),
|
||||
searchable_attributes: Setting::Set(vec![String::from("hello")]).into(),
|
||||
filterable_attributes: Setting::NotSet,
|
||||
sortable_attributes: Setting::NotSet,
|
||||
ranking_rules: Setting::NotSet,
|
||||
@ -883,8 +912,9 @@ pub(crate) mod test {
|
||||
// test wildcard
|
||||
// test no changes
|
||||
let settings = Settings {
|
||||
displayed_attributes: Setting::Set(vec![String::from("*")]),
|
||||
searchable_attributes: Setting::Set(vec![String::from("hello"), String::from("*")]),
|
||||
displayed_attributes: Setting::Set(vec![String::from("*")]).into(),
|
||||
searchable_attributes: Setting::Set(vec![String::from("hello"), String::from("*")])
|
||||
.into(),
|
||||
filterable_attributes: Setting::NotSet,
|
||||
sortable_attributes: Setting::NotSet,
|
||||
ranking_rules: Setting::NotSet,
|
||||
@ -904,7 +934,7 @@ pub(crate) mod test {
|
||||
};
|
||||
|
||||
let checked = settings.check();
|
||||
assert_eq!(checked.displayed_attributes, Setting::Reset);
|
||||
assert_eq!(checked.searchable_attributes, Setting::Reset);
|
||||
assert_eq!(checked.displayed_attributes, Setting::Reset.into());
|
||||
assert_eq!(checked.searchable_attributes, Setting::Reset.into());
|
||||
}
|
||||
}
|
||||
|
@ -1,3 +1,4 @@
|
||||
use milli::Object;
|
||||
use serde::Serialize;
|
||||
use time::{Duration, OffsetDateTime};
|
||||
|
||||
@ -54,6 +55,8 @@ pub struct DetailsView {
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub indexed_documents: Option<Option<u64>>,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub edited_documents: Option<Option<u64>>,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub primary_key: Option<Option<String>>,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub provided_ids: Option<usize>,
|
||||
@ -70,6 +73,10 @@ pub struct DetailsView {
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub dump_uid: Option<Option<String>>,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub context: Option<Option<Object>>,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub function: Option<String>,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
#[serde(flatten)]
|
||||
pub settings: Option<Box<Settings<Unchecked>>>,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
@ -86,6 +93,20 @@ impl From<Details> for DetailsView {
|
||||
..DetailsView::default()
|
||||
}
|
||||
}
|
||||
Details::DocumentEdition {
|
||||
deleted_documents,
|
||||
edited_documents,
|
||||
original_filter,
|
||||
context,
|
||||
function,
|
||||
} => DetailsView {
|
||||
deleted_documents: Some(deleted_documents),
|
||||
edited_documents: Some(edited_documents),
|
||||
original_filter: Some(original_filter),
|
||||
context: Some(context),
|
||||
function: Some(function),
|
||||
..DetailsView::default()
|
||||
},
|
||||
Details::SettingsUpdate { mut settings } => {
|
||||
settings.hide_secrets();
|
||||
DetailsView { settings: Some(settings), ..DetailsView::default() }
|
||||
|
@ -5,6 +5,7 @@ use std::str::FromStr;
|
||||
|
||||
use enum_iterator::Sequence;
|
||||
use milli::update::IndexDocumentsMethod;
|
||||
use milli::Object;
|
||||
use roaring::RoaringBitmap;
|
||||
use serde::{Deserialize, Serialize, Serializer};
|
||||
use time::{Duration, OffsetDateTime};
|
||||
@ -48,6 +49,7 @@ impl Task {
|
||||
| TaskDeletion { .. }
|
||||
| IndexSwap { .. } => None,
|
||||
DocumentAdditionOrUpdate { index_uid, .. }
|
||||
| DocumentEdition { index_uid, .. }
|
||||
| DocumentDeletion { index_uid, .. }
|
||||
| DocumentDeletionByFilter { index_uid, .. }
|
||||
| DocumentClear { index_uid }
|
||||
@ -67,7 +69,8 @@ impl Task {
|
||||
pub fn content_uuid(&self) -> Option<Uuid> {
|
||||
match self.kind {
|
||||
KindWithContent::DocumentAdditionOrUpdate { content_file, .. } => Some(content_file),
|
||||
KindWithContent::DocumentDeletion { .. }
|
||||
KindWithContent::DocumentEdition { .. }
|
||||
| KindWithContent::DocumentDeletion { .. }
|
||||
| KindWithContent::DocumentDeletionByFilter { .. }
|
||||
| KindWithContent::DocumentClear { .. }
|
||||
| KindWithContent::SettingsUpdate { .. }
|
||||
@ -94,6 +97,12 @@ pub enum KindWithContent {
|
||||
documents_count: u64,
|
||||
allow_index_creation: bool,
|
||||
},
|
||||
DocumentEdition {
|
||||
index_uid: String,
|
||||
filter_expr: Option<serde_json::Value>,
|
||||
context: Option<milli::Object>,
|
||||
function: String,
|
||||
},
|
||||
DocumentDeletion {
|
||||
index_uid: String,
|
||||
documents_ids: Vec<String>,
|
||||
@ -150,6 +159,7 @@ impl KindWithContent {
|
||||
pub fn as_kind(&self) -> Kind {
|
||||
match self {
|
||||
KindWithContent::DocumentAdditionOrUpdate { .. } => Kind::DocumentAdditionOrUpdate,
|
||||
KindWithContent::DocumentEdition { .. } => Kind::DocumentEdition,
|
||||
KindWithContent::DocumentDeletion { .. } => Kind::DocumentDeletion,
|
||||
KindWithContent::DocumentDeletionByFilter { .. } => Kind::DocumentDeletion,
|
||||
KindWithContent::DocumentClear { .. } => Kind::DocumentDeletion,
|
||||
@ -174,6 +184,7 @@ impl KindWithContent {
|
||||
| TaskCancelation { .. }
|
||||
| TaskDeletion { .. } => vec![],
|
||||
DocumentAdditionOrUpdate { index_uid, .. }
|
||||
| DocumentEdition { index_uid, .. }
|
||||
| DocumentDeletion { index_uid, .. }
|
||||
| DocumentDeletionByFilter { index_uid, .. }
|
||||
| DocumentClear { index_uid }
|
||||
@ -202,6 +213,15 @@ impl KindWithContent {
|
||||
indexed_documents: None,
|
||||
})
|
||||
}
|
||||
KindWithContent::DocumentEdition { index_uid: _, filter_expr, context, function } => {
|
||||
Some(Details::DocumentEdition {
|
||||
deleted_documents: None,
|
||||
edited_documents: None,
|
||||
original_filter: filter_expr.as_ref().map(|v| v.to_string()),
|
||||
context: context.clone(),
|
||||
function: function.clone(),
|
||||
})
|
||||
}
|
||||
KindWithContent::DocumentDeletion { index_uid: _, documents_ids } => {
|
||||
Some(Details::DocumentDeletion {
|
||||
provided_ids: documents_ids.len(),
|
||||
@ -250,6 +270,15 @@ impl KindWithContent {
|
||||
indexed_documents: Some(0),
|
||||
})
|
||||
}
|
||||
KindWithContent::DocumentEdition { index_uid: _, filter_expr, context, function } => {
|
||||
Some(Details::DocumentEdition {
|
||||
deleted_documents: Some(0),
|
||||
edited_documents: Some(0),
|
||||
original_filter: filter_expr.as_ref().map(|v| v.to_string()),
|
||||
context: context.clone(),
|
||||
function: function.clone(),
|
||||
})
|
||||
}
|
||||
KindWithContent::DocumentDeletion { index_uid: _, documents_ids } => {
|
||||
Some(Details::DocumentDeletion {
|
||||
provided_ids: documents_ids.len(),
|
||||
@ -301,6 +330,7 @@ impl From<&KindWithContent> for Option<Details> {
|
||||
indexed_documents: None,
|
||||
})
|
||||
}
|
||||
KindWithContent::DocumentEdition { .. } => None,
|
||||
KindWithContent::DocumentDeletion { .. } => None,
|
||||
KindWithContent::DocumentDeletionByFilter { .. } => None,
|
||||
KindWithContent::DocumentClear { .. } => None,
|
||||
@ -394,6 +424,7 @@ impl std::error::Error for ParseTaskStatusError {}
|
||||
#[serde(rename_all = "camelCase")]
|
||||
pub enum Kind {
|
||||
DocumentAdditionOrUpdate,
|
||||
DocumentEdition,
|
||||
DocumentDeletion,
|
||||
SettingsUpdate,
|
||||
IndexCreation,
|
||||
@ -410,6 +441,7 @@ impl Kind {
|
||||
pub fn related_to_one_index(&self) -> bool {
|
||||
match self {
|
||||
Kind::DocumentAdditionOrUpdate
|
||||
| Kind::DocumentEdition
|
||||
| Kind::DocumentDeletion
|
||||
| Kind::SettingsUpdate
|
||||
| Kind::IndexCreation
|
||||
@ -427,6 +459,7 @@ impl Display for Kind {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
match self {
|
||||
Kind::DocumentAdditionOrUpdate => write!(f, "documentAdditionOrUpdate"),
|
||||
Kind::DocumentEdition => write!(f, "documentEdition"),
|
||||
Kind::DocumentDeletion => write!(f, "documentDeletion"),
|
||||
Kind::SettingsUpdate => write!(f, "settingsUpdate"),
|
||||
Kind::IndexCreation => write!(f, "indexCreation"),
|
||||
@ -454,6 +487,8 @@ impl FromStr for Kind {
|
||||
Ok(Kind::IndexDeletion)
|
||||
} else if kind.eq_ignore_ascii_case("documentAdditionOrUpdate") {
|
||||
Ok(Kind::DocumentAdditionOrUpdate)
|
||||
} else if kind.eq_ignore_ascii_case("documentEdition") {
|
||||
Ok(Kind::DocumentEdition)
|
||||
} else if kind.eq_ignore_ascii_case("documentDeletion") {
|
||||
Ok(Kind::DocumentDeletion)
|
||||
} else if kind.eq_ignore_ascii_case("settingsUpdate") {
|
||||
@ -495,16 +530,50 @@ impl std::error::Error for ParseTaskKindError {}
|
||||
|
||||
#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)]
|
||||
pub enum Details {
|
||||
DocumentAdditionOrUpdate { received_documents: u64, indexed_documents: Option<u64> },
|
||||
SettingsUpdate { settings: Box<Settings<Unchecked>> },
|
||||
IndexInfo { primary_key: Option<String> },
|
||||
DocumentDeletion { provided_ids: usize, deleted_documents: Option<u64> },
|
||||
DocumentDeletionByFilter { original_filter: String, deleted_documents: Option<u64> },
|
||||
ClearAll { deleted_documents: Option<u64> },
|
||||
TaskCancelation { matched_tasks: u64, canceled_tasks: Option<u64>, original_filter: String },
|
||||
TaskDeletion { matched_tasks: u64, deleted_tasks: Option<u64>, original_filter: String },
|
||||
Dump { dump_uid: Option<String> },
|
||||
IndexSwap { swaps: Vec<IndexSwap> },
|
||||
DocumentAdditionOrUpdate {
|
||||
received_documents: u64,
|
||||
indexed_documents: Option<u64>,
|
||||
},
|
||||
DocumentEdition {
|
||||
deleted_documents: Option<u64>,
|
||||
edited_documents: Option<u64>,
|
||||
original_filter: Option<String>,
|
||||
context: Option<Object>,
|
||||
function: String,
|
||||
},
|
||||
SettingsUpdate {
|
||||
settings: Box<Settings<Unchecked>>,
|
||||
},
|
||||
IndexInfo {
|
||||
primary_key: Option<String>,
|
||||
},
|
||||
DocumentDeletion {
|
||||
provided_ids: usize,
|
||||
deleted_documents: Option<u64>,
|
||||
},
|
||||
DocumentDeletionByFilter {
|
||||
original_filter: String,
|
||||
deleted_documents: Option<u64>,
|
||||
},
|
||||
ClearAll {
|
||||
deleted_documents: Option<u64>,
|
||||
},
|
||||
TaskCancelation {
|
||||
matched_tasks: u64,
|
||||
canceled_tasks: Option<u64>,
|
||||
original_filter: String,
|
||||
},
|
||||
TaskDeletion {
|
||||
matched_tasks: u64,
|
||||
deleted_tasks: Option<u64>,
|
||||
original_filter: String,
|
||||
},
|
||||
Dump {
|
||||
dump_uid: Option<String>,
|
||||
},
|
||||
IndexSwap {
|
||||
swaps: Vec<IndexSwap>,
|
||||
},
|
||||
}
|
||||
|
||||
impl Details {
|
||||
@ -514,6 +583,7 @@ impl Details {
|
||||
Self::DocumentAdditionOrUpdate { indexed_documents, .. } => {
|
||||
*indexed_documents = Some(0)
|
||||
}
|
||||
Self::DocumentEdition { edited_documents, .. } => *edited_documents = Some(0),
|
||||
Self::DocumentDeletion { deleted_documents, .. } => *deleted_documents = Some(0),
|
||||
Self::DocumentDeletionByFilter { deleted_documents, .. } => {
|
||||
*deleted_documents = Some(0)
|
||||
|
@ -75,7 +75,7 @@ reqwest = { version = "0.11.23", features = [
|
||||
"rustls-tls",
|
||||
"json",
|
||||
], default-features = false }
|
||||
rustls = "0.21.6"
|
||||
rustls = "0.21.12"
|
||||
rustls-pemfile = "1.0.2"
|
||||
segment = { version = "0.2.3", optional = true }
|
||||
serde = { version = "1.0.195", features = ["derive"] }
|
||||
@ -149,12 +149,14 @@ mini-dashboard = [
|
||||
"zip",
|
||||
]
|
||||
chinese = ["meilisearch-types/chinese"]
|
||||
chinese-pinyin = ["meilisearch-types/chinese-pinyin"]
|
||||
hebrew = ["meilisearch-types/hebrew"]
|
||||
japanese = ["meilisearch-types/japanese"]
|
||||
thai = ["meilisearch-types/thai"]
|
||||
greek = ["meilisearch-types/greek"]
|
||||
khmer = ["meilisearch-types/khmer"]
|
||||
vietnamese = ["meilisearch-types/vietnamese"]
|
||||
swedish-recomposition = ["meilisearch-types/swedish-recomposition"]
|
||||
|
||||
[package.metadata.mini-dashboard]
|
||||
assets-url = "https://github.com/meilisearch/mini-dashboard/releases/download/v0.2.13/build.zip"
|
||||
|
@ -7,7 +7,6 @@ use serde_json::Value;
|
||||
|
||||
use super::{find_user_id, Analytics, DocumentDeletionKind, DocumentFetchKind};
|
||||
use crate::routes::indexes::documents::UpdateDocumentsQuery;
|
||||
use crate::routes::tasks::TasksFilterQuery;
|
||||
use crate::Opt;
|
||||
|
||||
pub struct MockAnalytics {
|
||||
@ -86,6 +85,4 @@ impl Analytics for MockAnalytics {
|
||||
}
|
||||
fn get_fetch_documents(&self, _documents_query: &DocumentFetchKind, _request: &HttpRequest) {}
|
||||
fn post_fetch_documents(&self, _documents_query: &DocumentFetchKind, _request: &HttpRequest) {}
|
||||
fn get_tasks(&self, _query: &TasksFilterQuery, _request: &HttpRequest) {}
|
||||
fn health_seen(&self, _request: &HttpRequest) {}
|
||||
}
|
||||
|
@ -14,7 +14,6 @@ use platform_dirs::AppDirs;
|
||||
use serde_json::Value;
|
||||
|
||||
use crate::routes::indexes::documents::UpdateDocumentsQuery;
|
||||
use crate::routes::tasks::TasksFilterQuery;
|
||||
|
||||
// if the analytics feature is disabled
|
||||
// the `SegmentAnalytics` point to the mock instead of the real analytics
|
||||
@ -117,10 +116,4 @@ pub trait Analytics: Sync + Send {
|
||||
index_creation: bool,
|
||||
request: &HttpRequest,
|
||||
);
|
||||
|
||||
// this method should be called to aggregate the get tasks requests.
|
||||
fn get_tasks(&self, query: &TasksFilterQuery, request: &HttpRequest);
|
||||
|
||||
// this method should be called to aggregate the health check requests
|
||||
fn health_seen(&self, request: &HttpRequest);
|
||||
}
|
||||
|
@ -33,7 +33,6 @@ use crate::option::{
|
||||
};
|
||||
use crate::routes::indexes::documents::UpdateDocumentsQuery;
|
||||
use crate::routes::indexes::facet_search::FacetSearchQuery;
|
||||
use crate::routes::tasks::TasksFilterQuery;
|
||||
use crate::routes::{create_all_stats, Stats};
|
||||
use crate::search::{
|
||||
FacetSearchResult, MatchingStrategy, SearchQuery, SearchQueryWithIndex, SearchResult,
|
||||
@ -81,8 +80,6 @@ pub enum AnalyticsMsg {
|
||||
AggregateUpdateDocuments(DocumentsAggregator),
|
||||
AggregateGetFetchDocuments(DocumentsFetchAggregator),
|
||||
AggregatePostFetchDocuments(DocumentsFetchAggregator),
|
||||
AggregateTasks(TasksAggregator),
|
||||
AggregateHealth(HealthAggregator),
|
||||
}
|
||||
|
||||
pub struct SegmentAnalytics {
|
||||
@ -152,8 +149,6 @@ impl SegmentAnalytics {
|
||||
update_documents_aggregator: DocumentsAggregator::default(),
|
||||
get_fetch_documents_aggregator: DocumentsFetchAggregator::default(),
|
||||
post_fetch_documents_aggregator: DocumentsFetchAggregator::default(),
|
||||
get_tasks_aggregator: TasksAggregator::default(),
|
||||
health_aggregator: HealthAggregator::default(),
|
||||
});
|
||||
tokio::spawn(segment.run(index_scheduler.clone(), auth_controller.clone()));
|
||||
|
||||
@ -231,16 +226,6 @@ impl super::Analytics for SegmentAnalytics {
|
||||
let aggregate = DocumentsFetchAggregator::from_query(documents_query, request);
|
||||
let _ = self.sender.try_send(AnalyticsMsg::AggregatePostFetchDocuments(aggregate));
|
||||
}
|
||||
|
||||
fn get_tasks(&self, query: &TasksFilterQuery, request: &HttpRequest) {
|
||||
let aggregate = TasksAggregator::from_query(query, request);
|
||||
let _ = self.sender.try_send(AnalyticsMsg::AggregateTasks(aggregate));
|
||||
}
|
||||
|
||||
fn health_seen(&self, request: &HttpRequest) {
|
||||
let aggregate = HealthAggregator::from_query(request);
|
||||
let _ = self.sender.try_send(AnalyticsMsg::AggregateHealth(aggregate));
|
||||
}
|
||||
}
|
||||
|
||||
/// This structure represents the `infos` field we send in the analytics.
|
||||
@ -394,8 +379,6 @@ pub struct Segment {
|
||||
update_documents_aggregator: DocumentsAggregator,
|
||||
get_fetch_documents_aggregator: DocumentsFetchAggregator,
|
||||
post_fetch_documents_aggregator: DocumentsFetchAggregator,
|
||||
get_tasks_aggregator: TasksAggregator,
|
||||
health_aggregator: HealthAggregator,
|
||||
}
|
||||
|
||||
impl Segment {
|
||||
@ -458,8 +441,6 @@ impl Segment {
|
||||
Some(AnalyticsMsg::AggregateUpdateDocuments(agreg)) => self.update_documents_aggregator.aggregate(agreg),
|
||||
Some(AnalyticsMsg::AggregateGetFetchDocuments(agreg)) => self.get_fetch_documents_aggregator.aggregate(agreg),
|
||||
Some(AnalyticsMsg::AggregatePostFetchDocuments(agreg)) => self.post_fetch_documents_aggregator.aggregate(agreg),
|
||||
Some(AnalyticsMsg::AggregateTasks(agreg)) => self.get_tasks_aggregator.aggregate(agreg),
|
||||
Some(AnalyticsMsg::AggregateHealth(agreg)) => self.health_aggregator.aggregate(agreg),
|
||||
None => (),
|
||||
}
|
||||
}
|
||||
@ -513,8 +494,6 @@ impl Segment {
|
||||
update_documents_aggregator,
|
||||
get_fetch_documents_aggregator,
|
||||
post_fetch_documents_aggregator,
|
||||
get_tasks_aggregator,
|
||||
health_aggregator,
|
||||
} = self;
|
||||
|
||||
if let Some(get_search) =
|
||||
@ -562,12 +541,6 @@ impl Segment {
|
||||
{
|
||||
let _ = self.batcher.push(post_fetch_documents).await;
|
||||
}
|
||||
if let Some(get_tasks) = take(get_tasks_aggregator).into_event(user, "Tasks Seen") {
|
||||
let _ = self.batcher.push(get_tasks).await;
|
||||
}
|
||||
if let Some(health) = take(health_aggregator).into_event(user, "Health Seen") {
|
||||
let _ = self.batcher.push(health).await;
|
||||
}
|
||||
let _ = self.batcher.flush().await;
|
||||
}
|
||||
}
|
||||
@ -1503,176 +1476,6 @@ impl DocumentsDeletionAggregator {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Default, Serialize)]
|
||||
pub struct TasksAggregator {
|
||||
#[serde(skip)]
|
||||
timestamp: Option<OffsetDateTime>,
|
||||
|
||||
// context
|
||||
#[serde(rename = "user-agent")]
|
||||
user_agents: HashSet<String>,
|
||||
|
||||
filtered_by_uid: bool,
|
||||
filtered_by_index_uid: bool,
|
||||
filtered_by_type: bool,
|
||||
filtered_by_status: bool,
|
||||
filtered_by_canceled_by: bool,
|
||||
filtered_by_before_enqueued_at: bool,
|
||||
filtered_by_after_enqueued_at: bool,
|
||||
filtered_by_before_started_at: bool,
|
||||
filtered_by_after_started_at: bool,
|
||||
filtered_by_before_finished_at: bool,
|
||||
filtered_by_after_finished_at: bool,
|
||||
total_received: usize,
|
||||
}
|
||||
|
||||
impl TasksAggregator {
|
||||
pub fn from_query(query: &TasksFilterQuery, request: &HttpRequest) -> Self {
|
||||
let TasksFilterQuery {
|
||||
limit: _,
|
||||
from: _,
|
||||
uids,
|
||||
index_uids,
|
||||
types,
|
||||
statuses,
|
||||
canceled_by,
|
||||
before_enqueued_at,
|
||||
after_enqueued_at,
|
||||
before_started_at,
|
||||
after_started_at,
|
||||
before_finished_at,
|
||||
after_finished_at,
|
||||
} = query;
|
||||
|
||||
Self {
|
||||
timestamp: Some(OffsetDateTime::now_utc()),
|
||||
user_agents: extract_user_agents(request).into_iter().collect(),
|
||||
filtered_by_uid: uids.is_some(),
|
||||
filtered_by_index_uid: index_uids.is_some(),
|
||||
filtered_by_type: types.is_some(),
|
||||
filtered_by_status: statuses.is_some(),
|
||||
filtered_by_canceled_by: canceled_by.is_some(),
|
||||
filtered_by_before_enqueued_at: before_enqueued_at.is_some(),
|
||||
filtered_by_after_enqueued_at: after_enqueued_at.is_some(),
|
||||
filtered_by_before_started_at: before_started_at.is_some(),
|
||||
filtered_by_after_started_at: after_started_at.is_some(),
|
||||
filtered_by_before_finished_at: before_finished_at.is_some(),
|
||||
filtered_by_after_finished_at: after_finished_at.is_some(),
|
||||
total_received: 1,
|
||||
}
|
||||
}
|
||||
|
||||
/// Aggregate one [TasksAggregator] into another.
|
||||
pub fn aggregate(&mut self, other: Self) {
|
||||
let Self {
|
||||
timestamp,
|
||||
user_agents,
|
||||
total_received,
|
||||
filtered_by_uid,
|
||||
filtered_by_index_uid,
|
||||
filtered_by_type,
|
||||
filtered_by_status,
|
||||
filtered_by_canceled_by,
|
||||
filtered_by_before_enqueued_at,
|
||||
filtered_by_after_enqueued_at,
|
||||
filtered_by_before_started_at,
|
||||
filtered_by_after_started_at,
|
||||
filtered_by_before_finished_at,
|
||||
filtered_by_after_finished_at,
|
||||
} = other;
|
||||
|
||||
if self.timestamp.is_none() {
|
||||
self.timestamp = timestamp;
|
||||
}
|
||||
|
||||
// we can't create a union because there is no `into_union` method
|
||||
for user_agent in user_agents {
|
||||
self.user_agents.insert(user_agent);
|
||||
}
|
||||
|
||||
self.filtered_by_uid |= filtered_by_uid;
|
||||
self.filtered_by_index_uid |= filtered_by_index_uid;
|
||||
self.filtered_by_type |= filtered_by_type;
|
||||
self.filtered_by_status |= filtered_by_status;
|
||||
self.filtered_by_canceled_by |= filtered_by_canceled_by;
|
||||
self.filtered_by_before_enqueued_at |= filtered_by_before_enqueued_at;
|
||||
self.filtered_by_after_enqueued_at |= filtered_by_after_enqueued_at;
|
||||
self.filtered_by_before_started_at |= filtered_by_before_started_at;
|
||||
self.filtered_by_after_started_at |= filtered_by_after_started_at;
|
||||
self.filtered_by_before_finished_at |= filtered_by_before_finished_at;
|
||||
self.filtered_by_after_finished_at |= filtered_by_after_finished_at;
|
||||
self.filtered_by_after_finished_at |= filtered_by_after_finished_at;
|
||||
|
||||
self.total_received = self.total_received.saturating_add(total_received);
|
||||
}
|
||||
|
||||
pub fn into_event(self, user: &User, event_name: &str) -> Option<Track> {
|
||||
// if we had no timestamp it means we never encountered any events and
|
||||
// thus we don't need to send this event.
|
||||
let timestamp = self.timestamp?;
|
||||
|
||||
Some(Track {
|
||||
timestamp: Some(timestamp),
|
||||
user: user.clone(),
|
||||
event: event_name.to_string(),
|
||||
properties: serde_json::to_value(self).ok()?,
|
||||
..Default::default()
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Default, Serialize)]
|
||||
pub struct HealthAggregator {
|
||||
#[serde(skip)]
|
||||
timestamp: Option<OffsetDateTime>,
|
||||
|
||||
// context
|
||||
#[serde(rename = "user-agent")]
|
||||
user_agents: HashSet<String>,
|
||||
|
||||
#[serde(rename = "requests.total_received")]
|
||||
total_received: usize,
|
||||
}
|
||||
|
||||
impl HealthAggregator {
|
||||
pub fn from_query(request: &HttpRequest) -> Self {
|
||||
Self {
|
||||
timestamp: Some(OffsetDateTime::now_utc()),
|
||||
user_agents: extract_user_agents(request).into_iter().collect(),
|
||||
total_received: 1,
|
||||
}
|
||||
}
|
||||
|
||||
/// Aggregate one [HealthAggregator] into another.
|
||||
pub fn aggregate(&mut self, other: Self) {
|
||||
let Self { timestamp, user_agents, total_received } = other;
|
||||
|
||||
if self.timestamp.is_none() {
|
||||
self.timestamp = timestamp;
|
||||
}
|
||||
|
||||
// we can't create a union because there is no `into_union` method
|
||||
for user_agent in user_agents {
|
||||
self.user_agents.insert(user_agent);
|
||||
}
|
||||
self.total_received = self.total_received.saturating_add(total_received);
|
||||
}
|
||||
|
||||
pub fn into_event(self, user: &User, event_name: &str) -> Option<Track> {
|
||||
// if we had no timestamp it means we never encountered any events and
|
||||
// thus we don't need to send this event.
|
||||
let timestamp = self.timestamp?;
|
||||
|
||||
Some(Track {
|
||||
timestamp: Some(timestamp),
|
||||
user: user.clone(),
|
||||
event: event_name.to_string(),
|
||||
properties: serde_json::to_value(self).ok()?,
|
||||
..Default::default()
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Default, Serialize)]
|
||||
pub struct DocumentsFetchAggregator {
|
||||
#[serde(skip)]
|
||||
|
@ -59,10 +59,12 @@ where
|
||||
let request_path = req.path();
|
||||
let is_registered_resource = req.resource_map().has_resource(request_path);
|
||||
if is_registered_resource {
|
||||
let request_pattern = req.match_pattern();
|
||||
let metric_path = request_pattern.as_ref().map_or(request_path, String::as_str);
|
||||
let request_method = req.method().to_string();
|
||||
histogram_timer = Some(
|
||||
crate::metrics::MEILISEARCH_HTTP_RESPONSE_TIME_SECONDS
|
||||
.with_label_values(&[&request_method, request_path])
|
||||
.with_label_values(&[&request_method, metric_path])
|
||||
.start_timer(),
|
||||
);
|
||||
}
|
||||
|
@ -13,6 +13,7 @@ use byte_unit::{Byte, ByteError};
|
||||
use clap::Parser;
|
||||
use meilisearch_types::features::InstanceTogglableFeatures;
|
||||
use meilisearch_types::milli::update::IndexerConfig;
|
||||
use meilisearch_types::milli::ThreadPoolNoAbortBuilder;
|
||||
use rustls::server::{
|
||||
AllowAnyAnonymousOrAuthenticatedClient, AllowAnyAuthenticatedClient, ServerSessionMemoryCache,
|
||||
};
|
||||
@ -666,7 +667,7 @@ impl TryFrom<&IndexerOpts> for IndexerConfig {
|
||||
type Error = anyhow::Error;
|
||||
|
||||
fn try_from(other: &IndexerOpts) -> Result<Self, Self::Error> {
|
||||
let thread_pool = rayon::ThreadPoolBuilder::new()
|
||||
let thread_pool = ThreadPoolNoAbortBuilder::new()
|
||||
.thread_name(|index| format!("indexing-thread:{index}"))
|
||||
.num_threads(*other.max_indexing_threads)
|
||||
.build()?;
|
||||
|
@ -81,6 +81,7 @@ pub fn configure(cfg: &mut web::ServiceConfig) {
|
||||
web::resource("/delete-batch").route(web::post().to(SeqHandler(delete_documents_batch))),
|
||||
)
|
||||
.service(web::resource("/delete").route(web::post().to(SeqHandler(delete_documents_by_filter))))
|
||||
.service(web::resource("/edit").route(web::post().to(SeqHandler(edit_documents_by_function))))
|
||||
.service(web::resource("/fetch").route(web::post().to(SeqHandler(documents_by_query_post))))
|
||||
.service(
|
||||
web::resource("/{document_id}")
|
||||
@ -553,6 +554,66 @@ pub async fn delete_documents_by_filter(
|
||||
Ok(HttpResponse::Accepted().json(task))
|
||||
}
|
||||
|
||||
#[derive(Debug, Deserr)]
|
||||
#[deserr(error = DeserrJsonError, rename_all = camelCase, deny_unknown_fields)]
|
||||
pub struct DocumentEditionByFunction {
|
||||
#[deserr(default, error = DeserrJsonError<InvalidDocumentFilter>)]
|
||||
filter: Option<Value>,
|
||||
#[deserr(default, error = DeserrJsonError<InvalidDocumentFilter>)]
|
||||
context: Option<Value>,
|
||||
#[deserr(error = DeserrJsonError<InvalidDocumentFilter>, missing_field_error = DeserrJsonError::missing_document_filter)]
|
||||
function: String,
|
||||
}
|
||||
|
||||
pub async fn edit_documents_by_function(
|
||||
index_scheduler: GuardedData<ActionPolicy<{ actions::DOCUMENTS_ADD }>, Data<IndexScheduler>>,
|
||||
index_uid: web::Path<String>,
|
||||
body: AwebJson<DocumentEditionByFunction, DeserrJsonError>,
|
||||
req: HttpRequest,
|
||||
opt: web::Data<Opt>,
|
||||
_analytics: web::Data<dyn Analytics>,
|
||||
) -> Result<HttpResponse, ResponseError> {
|
||||
debug!(parameters = ?body, "Edit documents by function");
|
||||
let index_uid = IndexUid::try_from(index_uid.into_inner())?;
|
||||
let index_uid = index_uid.into_inner();
|
||||
let DocumentEditionByFunction { filter, context, function } = body.into_inner();
|
||||
|
||||
// analytics.delete_documents(DocumentDeletionKind::PerFilter, &req);
|
||||
|
||||
let engine = milli::rhai::Engine::new();
|
||||
if let Err(e) = engine.compile(&function) {
|
||||
return Err(ResponseError::from_msg(e.to_string(), Code::BadRequest));
|
||||
}
|
||||
|
||||
if let Some(ref filter) = filter {
|
||||
// we ensure the filter is well formed before enqueuing it
|
||||
|| -> Result<_, ResponseError> {
|
||||
Ok(crate::search::parse_filter(filter)?.ok_or(MeilisearchHttpError::EmptyFilter)?)
|
||||
}()
|
||||
// and whatever was the error, the error code should always be an InvalidDocumentFilter
|
||||
.map_err(|err| ResponseError::from_msg(err.message, Code::InvalidDocumentFilter))?;
|
||||
}
|
||||
let task = KindWithContent::DocumentEdition {
|
||||
index_uid,
|
||||
filter_expr: filter,
|
||||
context: context.map(|v| match v {
|
||||
serde_json::Value::Object(m) => m,
|
||||
_ => panic!("The context must be an Object"),
|
||||
}),
|
||||
function,
|
||||
};
|
||||
|
||||
let uid = get_task_id(&req, &opt)?;
|
||||
let dry_run = is_dry_run(&req, &opt)?;
|
||||
let task: SummarizedTaskView =
|
||||
tokio::task::spawn_blocking(move || index_scheduler.register(task, uid, dry_run))
|
||||
.await??
|
||||
.into();
|
||||
|
||||
debug!(returns = ?task, "Delete documents by filter");
|
||||
Ok(HttpResponse::Accepted().json(task))
|
||||
}
|
||||
|
||||
pub async fn clear_all_documents(
|
||||
index_scheduler: GuardedData<ActionPolicy<{ actions::DOCUMENTS_DELETE }>, Data<IndexScheduler>>,
|
||||
index_uid: web::Path<String>,
|
||||
|
@ -269,12 +269,8 @@ impl From<index_scheduler::IndexStats> for IndexStats {
|
||||
pub async fn get_index_stats(
|
||||
index_scheduler: GuardedData<ActionPolicy<{ actions::STATS_GET }>, Data<IndexScheduler>>,
|
||||
index_uid: web::Path<String>,
|
||||
req: HttpRequest,
|
||||
analytics: web::Data<dyn Analytics>,
|
||||
) -> Result<HttpResponse, ResponseError> {
|
||||
let index_uid = IndexUid::try_from(index_uid.into_inner())?;
|
||||
analytics.publish("Stats Seen".to_string(), json!({ "per_index_uid": true }), Some(&req));
|
||||
|
||||
let stats = IndexStats::from(index_scheduler.index_stats(&index_uid)?);
|
||||
|
||||
debug!(returns = ?stats, "Get index stats");
|
||||
|
@ -137,10 +137,8 @@ macro_rules! make_setting_route {
|
||||
let settings = settings(&index, &rtxn, meilisearch_types::settings::SecretPolicy::HideSecrets)?;
|
||||
|
||||
debug!(returns = ?settings, "Update settings");
|
||||
let mut json = serde_json::json!(&settings);
|
||||
let val = json[$camelcase_attr].take();
|
||||
|
||||
Ok(HttpResponse::Ok().json(val))
|
||||
Ok(HttpResponse::Ok().json(settings.$attr))
|
||||
}
|
||||
|
||||
pub fn resources() -> Resource {
|
||||
|
@ -8,11 +8,9 @@ use meilisearch_types::error::{Code, ResponseError};
|
||||
use meilisearch_types::settings::{Settings, Unchecked};
|
||||
use meilisearch_types::tasks::{Kind, Status, Task, TaskId};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_json::json;
|
||||
use time::OffsetDateTime;
|
||||
use tracing::debug;
|
||||
|
||||
use crate::analytics::Analytics;
|
||||
use crate::extractors::authentication::policies::*;
|
||||
use crate::extractors::authentication::GuardedData;
|
||||
use crate::search_queue::SearchQueue;
|
||||
@ -296,10 +294,7 @@ pub struct Stats {
|
||||
async fn get_stats(
|
||||
index_scheduler: GuardedData<ActionPolicy<{ actions::STATS_GET }>, Data<IndexScheduler>>,
|
||||
auth_controller: GuardedData<ActionPolicy<{ actions::STATS_GET }>, Data<AuthController>>,
|
||||
req: HttpRequest,
|
||||
analytics: web::Data<dyn Analytics>,
|
||||
) -> Result<HttpResponse, ResponseError> {
|
||||
analytics.publish("Stats Seen".to_string(), json!({ "per_index_uid": false }), Some(&req));
|
||||
let filters = index_scheduler.filters();
|
||||
|
||||
let stats = create_all_stats((*index_scheduler).clone(), (*auth_controller).clone(), filters)?;
|
||||
@ -355,11 +350,7 @@ struct VersionResponse {
|
||||
|
||||
async fn get_version(
|
||||
_index_scheduler: GuardedData<ActionPolicy<{ actions::VERSION }>, Data<IndexScheduler>>,
|
||||
req: HttpRequest,
|
||||
analytics: web::Data<dyn Analytics>,
|
||||
) -> HttpResponse {
|
||||
analytics.publish("Version Seen".to_string(), json!(null), Some(&req));
|
||||
|
||||
let build_info = build_info::BuildInfo::from_build();
|
||||
|
||||
HttpResponse::Ok().json(VersionResponse {
|
||||
@ -376,21 +367,11 @@ async fn get_version(
|
||||
})
|
||||
}
|
||||
|
||||
#[derive(Serialize)]
|
||||
struct KeysResponse {
|
||||
private: Option<String>,
|
||||
public: Option<String>,
|
||||
}
|
||||
|
||||
pub async fn get_health(
|
||||
req: HttpRequest,
|
||||
index_scheduler: Data<IndexScheduler>,
|
||||
auth_controller: Data<AuthController>,
|
||||
search_queue: Data<SearchQueue>,
|
||||
analytics: web::Data<dyn Analytics>,
|
||||
) -> Result<HttpResponse, ResponseError> {
|
||||
analytics.health_seen(&req);
|
||||
|
||||
search_queue.health().unwrap();
|
||||
index_scheduler.health().unwrap();
|
||||
auth_controller.health().unwrap();
|
||||
|
@ -270,12 +270,8 @@ pub struct AllTasks {
|
||||
async fn get_tasks(
|
||||
index_scheduler: GuardedData<ActionPolicy<{ actions::TASKS_GET }>, Data<IndexScheduler>>,
|
||||
params: AwebQueryParameter<TasksFilterQuery, DeserrQueryParamError>,
|
||||
req: HttpRequest,
|
||||
analytics: web::Data<dyn Analytics>,
|
||||
) -> Result<HttpResponse, ResponseError> {
|
||||
let mut params = params.into_inner();
|
||||
analytics.get_tasks(¶ms, &req);
|
||||
|
||||
// We +1 just to know if there is more after this "page" or not.
|
||||
params.limit.0 = params.limit.0.saturating_add(1);
|
||||
let limit = params.limit.0;
|
||||
@ -298,8 +294,6 @@ async fn get_tasks(
|
||||
async fn get_task(
|
||||
index_scheduler: GuardedData<ActionPolicy<{ actions::TASKS_GET }>, Data<IndexScheduler>>,
|
||||
task_uid: web::Path<String>,
|
||||
req: HttpRequest,
|
||||
analytics: web::Data<dyn Analytics>,
|
||||
) -> Result<HttpResponse, ResponseError> {
|
||||
let task_uid_string = task_uid.into_inner();
|
||||
|
||||
@ -310,8 +304,6 @@ async fn get_task(
|
||||
}
|
||||
};
|
||||
|
||||
analytics.publish("Tasks Seen".to_string(), json!({ "per_task_uid": true }), Some(&req));
|
||||
|
||||
let query = index_scheduler::Query { uids: Some(vec![task_uid]), ..Query::default() };
|
||||
let filters = index_scheduler.filters();
|
||||
let (tasks, _) = index_scheduler.get_tasks_from_authorized_indexes(query, filters)?;
|
||||
@ -599,7 +591,7 @@ mod tests {
|
||||
let err = deserr_query_params::<TaskDeletionOrCancelationQuery>(params).unwrap_err();
|
||||
snapshot!(meili_snap::json_string!(err), @r###"
|
||||
{
|
||||
"message": "Invalid value in parameter `types`: `createIndex` is not a valid task type. Available types are `documentAdditionOrUpdate`, `documentDeletion`, `settingsUpdate`, `indexCreation`, `indexDeletion`, `indexUpdate`, `indexSwap`, `taskCancelation`, `taskDeletion`, `dumpCreation`, `snapshotCreation`.",
|
||||
"message": "Invalid value in parameter `types`: `createIndex` is not a valid task type. Available types are `documentAdditionOrUpdate`, `documentEdition`, `documentDeletion`, `settingsUpdate`, `indexCreation`, `indexDeletion`, `indexUpdate`, `indexSwap`, `taskCancelation`, `taskDeletion`, `dumpCreation`, `snapshotCreation`.",
|
||||
"code": "invalid_task_types",
|
||||
"type": "invalid_request",
|
||||
"link": "https://docs.meilisearch.com/errors#invalid_task_types"
|
||||
|
@ -1,3 +1,4 @@
|
||||
use core::fmt;
|
||||
use std::cmp::min;
|
||||
use std::collections::{BTreeMap, BTreeSet, HashSet};
|
||||
use std::str::FromStr;
|
||||
@ -39,7 +40,7 @@ pub const DEFAULT_HIGHLIGHT_PRE_TAG: fn() -> String = || "<em>".to_string();
|
||||
pub const DEFAULT_HIGHLIGHT_POST_TAG: fn() -> String = || "</em>".to_string();
|
||||
pub const DEFAULT_SEMANTIC_RATIO: fn() -> SemanticRatio = || SemanticRatio(0.5);
|
||||
|
||||
#[derive(Debug, Clone, Default, PartialEq, Deserr)]
|
||||
#[derive(Clone, Default, PartialEq, Deserr)]
|
||||
#[deserr(error = DeserrJsonError, rename_all = camelCase, deny_unknown_fields)]
|
||||
pub struct SearchQuery {
|
||||
#[deserr(default, error = DeserrJsonError<InvalidSearchQ>)]
|
||||
@ -88,6 +89,110 @@ pub struct SearchQuery {
|
||||
pub attributes_to_search_on: Option<Vec<String>>,
|
||||
}
|
||||
|
||||
// Since this structure is logged A LOT we're going to reduce the number of things it logs to the bare minimum.
|
||||
// - Only what IS used, we know everything else is set to None so there is no need to print it
|
||||
// - Re-order the most important field to debug first
|
||||
impl fmt::Debug for SearchQuery {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
let Self {
|
||||
q,
|
||||
vector,
|
||||
hybrid,
|
||||
offset,
|
||||
limit,
|
||||
page,
|
||||
hits_per_page,
|
||||
attributes_to_retrieve,
|
||||
attributes_to_crop,
|
||||
crop_length,
|
||||
attributes_to_highlight,
|
||||
show_matches_position,
|
||||
show_ranking_score,
|
||||
show_ranking_score_details,
|
||||
filter,
|
||||
sort,
|
||||
facets,
|
||||
highlight_pre_tag,
|
||||
highlight_post_tag,
|
||||
crop_marker,
|
||||
matching_strategy,
|
||||
attributes_to_search_on,
|
||||
} = self;
|
||||
|
||||
let mut debug = f.debug_struct("SearchQuery");
|
||||
|
||||
// First, everything related to the number of documents to retrieve
|
||||
debug.field("limit", &limit).field("offset", &offset);
|
||||
if let Some(page) = page {
|
||||
debug.field("page", &page);
|
||||
}
|
||||
if let Some(hits_per_page) = hits_per_page {
|
||||
debug.field("hits_per_page", &hits_per_page);
|
||||
}
|
||||
|
||||
// Then, everything related to the queries
|
||||
if let Some(q) = q {
|
||||
debug.field("q", &q);
|
||||
}
|
||||
if let Some(v) = vector {
|
||||
if v.len() < 10 {
|
||||
debug.field("vector", &v);
|
||||
} else {
|
||||
debug.field(
|
||||
"vector",
|
||||
&format!("[{}, {}, {}, ... {} dimensions]", v[0], v[1], v[2], v.len()),
|
||||
);
|
||||
}
|
||||
}
|
||||
if let Some(hybrid) = hybrid {
|
||||
debug.field("hybrid", &hybrid);
|
||||
}
|
||||
if let Some(attributes_to_search_on) = attributes_to_search_on {
|
||||
debug.field("attributes_to_search_on", &attributes_to_search_on);
|
||||
}
|
||||
if let Some(filter) = filter {
|
||||
debug.field("filter", &filter);
|
||||
}
|
||||
if let Some(sort) = sort {
|
||||
debug.field("sort", &sort);
|
||||
}
|
||||
if let Some(facets) = facets {
|
||||
debug.field("facets", &facets);
|
||||
}
|
||||
debug.field("matching_strategy", &matching_strategy);
|
||||
|
||||
// Then everything related to the formatting
|
||||
debug.field("crop_length", &crop_length);
|
||||
if *show_matches_position {
|
||||
debug.field("show_matches_position", show_matches_position);
|
||||
}
|
||||
if *show_ranking_score {
|
||||
debug.field("show_ranking_score", show_ranking_score);
|
||||
}
|
||||
if *show_ranking_score_details {
|
||||
debug.field("self.show_ranking_score_details", show_ranking_score_details);
|
||||
}
|
||||
debug.field("crop_length", &crop_length);
|
||||
if let Some(facets) = facets {
|
||||
debug.field("facets", &facets);
|
||||
}
|
||||
if let Some(attributes_to_retrieve) = attributes_to_retrieve {
|
||||
debug.field("attributes_to_retrieve", &attributes_to_retrieve);
|
||||
}
|
||||
if let Some(attributes_to_crop) = attributes_to_crop {
|
||||
debug.field("attributes_to_crop", &attributes_to_crop);
|
||||
}
|
||||
if let Some(attributes_to_highlight) = attributes_to_highlight {
|
||||
debug.field("attributes_to_highlight", &attributes_to_highlight);
|
||||
}
|
||||
debug.field("highlight_pre_tag", &highlight_pre_tag);
|
||||
debug.field("highlight_post_tag", &highlight_post_tag);
|
||||
debug.field("crop_marker", &crop_marker);
|
||||
|
||||
debug.finish()
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Default, PartialEq, Deserr)]
|
||||
#[deserr(error = DeserrJsonError<InvalidHybridQuery>, rename_all = camelCase, deny_unknown_fields)]
|
||||
pub struct HybridQuery {
|
||||
@ -370,7 +475,7 @@ pub struct SearchHit {
|
||||
pub ranking_score_details: Option<serde_json::Map<String, serde_json::Value>>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Debug, Clone, PartialEq)]
|
||||
#[derive(Serialize, Clone, PartialEq)]
|
||||
#[serde(rename_all = "camelCase")]
|
||||
pub struct SearchResult {
|
||||
pub hits: Vec<SearchHit>,
|
||||
@ -393,6 +498,46 @@ pub struct SearchResult {
|
||||
pub used_negative_operator: bool,
|
||||
}
|
||||
|
||||
impl fmt::Debug for SearchResult {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
let SearchResult {
|
||||
hits,
|
||||
query,
|
||||
processing_time_ms,
|
||||
hits_info,
|
||||
facet_distribution,
|
||||
facet_stats,
|
||||
semantic_hit_count,
|
||||
degraded,
|
||||
used_negative_operator,
|
||||
} = self;
|
||||
|
||||
let mut debug = f.debug_struct("SearchResult");
|
||||
// The most important thing when looking at a search result is the time it took to process
|
||||
debug.field("processing_time_ms", &processing_time_ms);
|
||||
debug.field("hits", &format!("[{} hits returned]", hits.len()));
|
||||
debug.field("query", &query);
|
||||
debug.field("hits_info", &hits_info);
|
||||
if *used_negative_operator {
|
||||
debug.field("used_negative_operator", used_negative_operator);
|
||||
}
|
||||
if *degraded {
|
||||
debug.field("degraded", degraded);
|
||||
}
|
||||
if let Some(facet_distribution) = facet_distribution {
|
||||
debug.field("facet_distribution", &facet_distribution);
|
||||
}
|
||||
if let Some(facet_stats) = facet_stats {
|
||||
debug.field("facet_stats", &facet_stats);
|
||||
}
|
||||
if let Some(semantic_hit_count) = semantic_hit_count {
|
||||
debug.field("semantic_hit_count", &semantic_hit_count);
|
||||
}
|
||||
|
||||
debug.finish()
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Serialize, Debug, Clone, PartialEq)]
|
||||
#[serde(rename_all = "camelCase")]
|
||||
pub struct SearchResultWithIndex {
|
||||
|
@ -113,7 +113,8 @@ async fn secrets_are_hidden_in_settings() {
|
||||
"default": {
|
||||
"source": "rest",
|
||||
"url": "https://localhost:7777",
|
||||
"apiKey": "My super secret value you will never guess"
|
||||
"apiKey": "My super secret value you will never guess",
|
||||
"dimensions": 4,
|
||||
}
|
||||
}
|
||||
}))
|
||||
@ -184,6 +185,7 @@ async fn secrets_are_hidden_in_settings() {
|
||||
"default": {
|
||||
"source": "rest",
|
||||
"apiKey": "My suXXXXXX...",
|
||||
"dimensions": 4,
|
||||
"documentTemplate": "{% for field in fields %} {{ field.name }}: {{ field.value }}\n{% endfor %}",
|
||||
"url": "https://localhost:7777",
|
||||
"query": null,
|
||||
@ -211,6 +213,7 @@ async fn secrets_are_hidden_in_settings() {
|
||||
"default": {
|
||||
"source": "rest",
|
||||
"apiKey": "My suXXXXXX...",
|
||||
"dimensions": 4,
|
||||
"url": "https://localhost:7777"
|
||||
}
|
||||
}
|
||||
|
@ -97,7 +97,7 @@ async fn task_bad_types() {
|
||||
snapshot!(code, @"400 Bad Request");
|
||||
snapshot!(json_string!(response), @r###"
|
||||
{
|
||||
"message": "Invalid value in parameter `types`: `doggo` is not a valid task type. Available types are `documentAdditionOrUpdate`, `documentDeletion`, `settingsUpdate`, `indexCreation`, `indexDeletion`, `indexUpdate`, `indexSwap`, `taskCancelation`, `taskDeletion`, `dumpCreation`, `snapshotCreation`.",
|
||||
"message": "Invalid value in parameter `types`: `doggo` is not a valid task type. Available types are `documentAdditionOrUpdate`, `documentEdition`, `documentDeletion`, `settingsUpdate`, `indexCreation`, `indexDeletion`, `indexUpdate`, `indexSwap`, `taskCancelation`, `taskDeletion`, `dumpCreation`, `snapshotCreation`.",
|
||||
"code": "invalid_task_types",
|
||||
"type": "invalid_request",
|
||||
"link": "https://docs.meilisearch.com/errors#invalid_task_types"
|
||||
@ -108,7 +108,7 @@ async fn task_bad_types() {
|
||||
snapshot!(code, @"400 Bad Request");
|
||||
snapshot!(json_string!(response), @r###"
|
||||
{
|
||||
"message": "Invalid value in parameter `types`: `doggo` is not a valid task type. Available types are `documentAdditionOrUpdate`, `documentDeletion`, `settingsUpdate`, `indexCreation`, `indexDeletion`, `indexUpdate`, `indexSwap`, `taskCancelation`, `taskDeletion`, `dumpCreation`, `snapshotCreation`.",
|
||||
"message": "Invalid value in parameter `types`: `doggo` is not a valid task type. Available types are `documentAdditionOrUpdate`, `documentEdition`, `documentDeletion`, `settingsUpdate`, `indexCreation`, `indexDeletion`, `indexUpdate`, `indexSwap`, `taskCancelation`, `taskDeletion`, `dumpCreation`, `snapshotCreation`.",
|
||||
"code": "invalid_task_types",
|
||||
"type": "invalid_request",
|
||||
"link": "https://docs.meilisearch.com/errors#invalid_task_types"
|
||||
@ -119,7 +119,7 @@ async fn task_bad_types() {
|
||||
snapshot!(code, @"400 Bad Request");
|
||||
snapshot!(json_string!(response), @r###"
|
||||
{
|
||||
"message": "Invalid value in parameter `types`: `doggo` is not a valid task type. Available types are `documentAdditionOrUpdate`, `documentDeletion`, `settingsUpdate`, `indexCreation`, `indexDeletion`, `indexUpdate`, `indexSwap`, `taskCancelation`, `taskDeletion`, `dumpCreation`, `snapshotCreation`.",
|
||||
"message": "Invalid value in parameter `types`: `doggo` is not a valid task type. Available types are `documentAdditionOrUpdate`, `documentEdition`, `documentDeletion`, `settingsUpdate`, `indexCreation`, `indexDeletion`, `indexUpdate`, `indexSwap`, `taskCancelation`, `taskDeletion`, `dumpCreation`, `snapshotCreation`.",
|
||||
"code": "invalid_task_types",
|
||||
"type": "invalid_request",
|
||||
"link": "https://docs.meilisearch.com/errors#invalid_task_types"
|
||||
|
@ -129,7 +129,7 @@ fn clear_task_queue(db_path: PathBuf) -> anyhow::Result<()> {
|
||||
}
|
||||
}
|
||||
|
||||
eprintln!("Sucessfully deleted {count} content files from disk!");
|
||||
eprintln!("Successfully deleted {count} content files from disk!");
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
@ -17,7 +17,7 @@ bincode = "1.3.3"
|
||||
bstr = "1.9.0"
|
||||
bytemuck = { version = "1.14.0", features = ["extern_crate_alloc"] }
|
||||
byteorder = "1.5.0"
|
||||
charabia = { version = "0.8.8", default-features = false }
|
||||
charabia = { version = "0.8.10", default-features = false }
|
||||
concat-arrays = "0.1.2"
|
||||
crossbeam-channel = "0.5.11"
|
||||
deserr = "0.6.1"
|
||||
@ -26,7 +26,7 @@ flatten-serde-json = { path = "../flatten-serde-json" }
|
||||
fst = "0.4.7"
|
||||
fxhash = "0.2.1"
|
||||
geoutils = "0.5.1"
|
||||
grenad = { version = "0.4.5", default-features = false, features = [
|
||||
grenad = { version = "0.4.6", default-features = false, features = [
|
||||
"rayon",
|
||||
"tempfile",
|
||||
] }
|
||||
@ -85,8 +85,9 @@ liquid = "0.26.4"
|
||||
arroy = "0.2.0"
|
||||
rand = "0.8.5"
|
||||
tracing = "0.1.40"
|
||||
ureq = { version = "2.9.6", features = ["json"] }
|
||||
ureq = { version = "2.9.7", features = ["json"] }
|
||||
url = "2.5.0"
|
||||
rhai = { version = "1.18.0", features = ["serde", "no_module", "no_custom_syntax"] }
|
||||
|
||||
[dev-dependencies]
|
||||
mimalloc = { version = "0.1.39", default-features = false }
|
||||
@ -115,6 +116,7 @@ lmdb-posix-sem = ["heed/posix-sem"]
|
||||
|
||||
# allow chinese specialized tokenization
|
||||
chinese = ["charabia/chinese"]
|
||||
chinese-pinyin = ["chinese", "charabia/chinese-normalization-pinyin"]
|
||||
|
||||
# allow hebrew specialized tokenization
|
||||
hebrew = ["charabia/hebrew"]
|
||||
@ -135,7 +137,11 @@ greek = ["charabia/greek"]
|
||||
# allow khmer specialized tokenization
|
||||
khmer = ["charabia/khmer"]
|
||||
|
||||
# allow vietnamese specialized tokenization
|
||||
vietnamese = ["charabia/vietnamese"]
|
||||
|
||||
# force swedish character recomposition
|
||||
swedish-recomposition = ["charabia/swedish-recomposition"]
|
||||
|
||||
# allow CUDA support, see <https://github.com/meilisearch/meilisearch/issues/4306>
|
||||
cuda = ["candle-core/cuda"]
|
||||
|
@ -203,7 +203,7 @@ fn parse_csv_header(header: &str) -> (&str, AllowedType) {
|
||||
"string" => (field_name, AllowedType::String),
|
||||
"boolean" => (field_name, AllowedType::Boolean),
|
||||
"number" => (field_name, AllowedType::Number),
|
||||
// if the pattern isn't reconized, we keep the whole field.
|
||||
// if the pattern isn't recognized, we keep the whole field.
|
||||
_otherwise => (header, AllowedType::String),
|
||||
},
|
||||
None => (header, AllowedType::String),
|
||||
|
@ -9,6 +9,7 @@ use serde_json::Value;
|
||||
use thiserror::Error;
|
||||
|
||||
use crate::documents::{self, DocumentsBatchCursorError};
|
||||
use crate::thread_pool_no_abort::PanicCatched;
|
||||
use crate::{CriterionError, DocumentId, FieldId, Object, SortError};
|
||||
|
||||
pub fn is_reserved_keyword(keyword: &str) -> bool {
|
||||
@ -39,17 +40,19 @@ pub enum InternalError {
|
||||
Fst(#[from] fst::Error),
|
||||
#[error(transparent)]
|
||||
DocumentsError(#[from] documents::Error),
|
||||
#[error("Invalid compression type have been specified to grenad.")]
|
||||
#[error("Invalid compression type have been specified to grenad")]
|
||||
GrenadInvalidCompressionType,
|
||||
#[error("Invalid grenad file with an invalid version format.")]
|
||||
#[error("Invalid grenad file with an invalid version format")]
|
||||
GrenadInvalidFormatVersion,
|
||||
#[error("Invalid merge while processing {process}.")]
|
||||
#[error("Invalid merge while processing {process}")]
|
||||
IndexingMergingKeys { process: &'static str },
|
||||
#[error("{}", HeedError::InvalidDatabaseTyping)]
|
||||
InvalidDatabaseTyping,
|
||||
#[error(transparent)]
|
||||
RayonThreadPool(#[from] ThreadPoolBuildError),
|
||||
#[error(transparent)]
|
||||
PanicInThreadPool(#[from] PanicCatched),
|
||||
#[error(transparent)]
|
||||
SerdeJson(#[from] serde_json::Error),
|
||||
#[error(transparent)]
|
||||
Serialization(#[from] SerializationError),
|
||||
@ -57,9 +60,9 @@ pub enum InternalError {
|
||||
Store(#[from] MdbError),
|
||||
#[error(transparent)]
|
||||
Utf8(#[from] str::Utf8Error),
|
||||
#[error("An indexation process was explicitly aborted.")]
|
||||
#[error("An indexation process was explicitly aborted")]
|
||||
AbortedIndexation,
|
||||
#[error("The matching words list contains at least one invalid member.")]
|
||||
#[error("The matching words list contains at least one invalid member")]
|
||||
InvalidMatchingWords,
|
||||
#[error(transparent)]
|
||||
ArroyError(#[from] arroy::Error),
|
||||
|
@ -678,6 +678,23 @@ impl Index {
|
||||
.get(rtxn, main_key::USER_DEFINED_SEARCHABLE_FIELDS_KEY)
|
||||
}
|
||||
|
||||
/// Identical to `user_defined_searchable_fields`, but returns ids instead.
|
||||
pub fn user_defined_searchable_fields_ids(&self, rtxn: &RoTxn) -> Result<Option<Vec<FieldId>>> {
|
||||
match self.user_defined_searchable_fields(rtxn)? {
|
||||
Some(fields) => {
|
||||
let fields_ids_map = self.fields_ids_map(rtxn)?;
|
||||
let mut fields_ids = Vec::new();
|
||||
for name in fields {
|
||||
if let Some(field_id) = fields_ids_map.id(name) {
|
||||
fields_ids.push(field_id);
|
||||
}
|
||||
}
|
||||
Ok(Some(fields_ids))
|
||||
}
|
||||
None => Ok(None),
|
||||
}
|
||||
}
|
||||
|
||||
/* filterable fields */
|
||||
|
||||
/// Writes the filterable fields names in the database.
|
||||
@ -824,11 +841,11 @@ impl Index {
|
||||
|
||||
/// Identical to `user_defined_faceted_fields`, but returns ids instead.
|
||||
pub fn user_defined_faceted_fields_ids(&self, rtxn: &RoTxn) -> Result<HashSet<FieldId>> {
|
||||
let fields = self.faceted_fields(rtxn)?;
|
||||
let fields = self.user_defined_faceted_fields(rtxn)?;
|
||||
let fields_ids_map = self.fields_ids_map(rtxn)?;
|
||||
|
||||
let mut fields_ids = HashSet::new();
|
||||
for name in fields.into_iter() {
|
||||
for name in fields {
|
||||
if let Some(field_id) = fields_ids_map.id(&name) {
|
||||
fields_ids.insert(field_id);
|
||||
}
|
||||
|
@ -21,6 +21,7 @@ pub mod prompt;
|
||||
pub mod proximity;
|
||||
pub mod score_details;
|
||||
mod search;
|
||||
mod thread_pool_no_abort;
|
||||
pub mod update;
|
||||
pub mod vector;
|
||||
|
||||
@ -42,7 +43,8 @@ pub use search::new::{
|
||||
SearchLogger, VisualSearchLogger,
|
||||
};
|
||||
use serde_json::Value;
|
||||
pub use {charabia as tokenizer, heed};
|
||||
pub use thread_pool_no_abort::{PanicCatched, ThreadPoolNoAbort, ThreadPoolNoAbortBuilder};
|
||||
pub use {charabia as tokenizer, heed, rhai};
|
||||
|
||||
pub use self::asc_desc::{AscDesc, AscDescError, Member, SortError};
|
||||
pub use self::criterion::{default_criteria, Criterion, CriterionError};
|
||||
@ -128,7 +130,7 @@ impl fmt::Debug for TimeBudget {
|
||||
|
||||
impl Default for TimeBudget {
|
||||
fn default() -> Self {
|
||||
Self::new(std::time::Duration::from_millis(150))
|
||||
Self::new(std::time::Duration::from_millis(1500))
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -97,6 +97,7 @@ impl<'a> FacetDistribution<'a> {
|
||||
) -> heed::Result<()> {
|
||||
match facet_type {
|
||||
FacetType::Number => {
|
||||
let mut lexicographic_distribution = BTreeMap::new();
|
||||
let mut key_buffer: Vec<_> = field_id.to_be_bytes().to_vec();
|
||||
|
||||
let distribution_prelength = distribution.len();
|
||||
@ -111,14 +112,17 @@ impl<'a> FacetDistribution<'a> {
|
||||
|
||||
for result in iter {
|
||||
let ((_, _, value), ()) = result?;
|
||||
*distribution.entry(value.to_string()).or_insert(0) += 1;
|
||||
*lexicographic_distribution.entry(value.to_string()).or_insert(0) += 1;
|
||||
|
||||
if distribution.len() - distribution_prelength == self.max_values_per_facet
|
||||
if lexicographic_distribution.len() - distribution_prelength
|
||||
== self.max_values_per_facet
|
||||
{
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
distribution.extend(lexicographic_distribution);
|
||||
}
|
||||
FacetType::String => {
|
||||
let mut normalized_distribution = BTreeMap::new();
|
||||
|
@ -42,7 +42,7 @@ fn facet_number_values<'a>(
|
||||
}
|
||||
|
||||
/// Define the strategy used by the geo sort.
|
||||
/// The paramater represents the cache size, and, in the case of the Dynamic strategy,
|
||||
/// The parameter represents the cache size, and, in the case of the Dynamic strategy,
|
||||
/// the point where we move from using the iterative strategy to the rtree.
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub enum Strategy {
|
||||
|
@ -134,7 +134,7 @@ impl<'t> Matcher<'t, '_> {
|
||||
for (token_position, word_position, word) in words_positions {
|
||||
partial = match partial.match_token(word) {
|
||||
// token matches the partial match, but the match is not full,
|
||||
// we temporarly save the current token then we try to match the next one.
|
||||
// we temporarily save the current token then we try to match the next one.
|
||||
Some(MatchType::Partial(partial)) => {
|
||||
potential_matches.push((token_position, word_position, partial.char_len()));
|
||||
partial
|
||||
@ -722,7 +722,7 @@ mod tests {
|
||||
@"…void void void void void split the world void void"
|
||||
);
|
||||
|
||||
// Text containing matches with diferent density.
|
||||
// Text containing matches with different density.
|
||||
let text = "split void the void void world void void void void void void void void void void split the world void void";
|
||||
let mut matcher = builder.build(text);
|
||||
// crop should return 10 last words with a marker at the start.
|
||||
|
@ -119,7 +119,7 @@ pub fn located_query_terms_from_tokens(
|
||||
if let Some(located_query_term) = phrase.build(ctx) {
|
||||
// as we are evaluating a negative operator we put the phrase
|
||||
// in the negative one *but* we don't reset the negative operator
|
||||
// as we are immediatly starting a new negative phrase.
|
||||
// as we are immediately starting a new negative phrase.
|
||||
if negative_phrase {
|
||||
negative_phrases.push(located_query_term);
|
||||
} else {
|
||||
|
69
milli/src/thread_pool_no_abort.rs
Normal file
69
milli/src/thread_pool_no_abort.rs
Normal file
@ -0,0 +1,69 @@
|
||||
use std::sync::atomic::{AtomicBool, Ordering};
|
||||
use std::sync::Arc;
|
||||
|
||||
use rayon::{ThreadPool, ThreadPoolBuilder};
|
||||
use thiserror::Error;
|
||||
|
||||
/// A rayon ThreadPool wrapper that can catch panics in the pool
|
||||
/// and modifies the install function accordingly.
|
||||
#[derive(Debug)]
|
||||
pub struct ThreadPoolNoAbort {
|
||||
thread_pool: ThreadPool,
|
||||
/// Set to true if the thread pool caught a panic.
|
||||
pool_catched_panic: Arc<AtomicBool>,
|
||||
}
|
||||
|
||||
impl ThreadPoolNoAbort {
|
||||
pub fn install<OP, R>(&self, op: OP) -> Result<R, PanicCatched>
|
||||
where
|
||||
OP: FnOnce() -> R + Send,
|
||||
R: Send,
|
||||
{
|
||||
let output = self.thread_pool.install(op);
|
||||
// While reseting the pool panic catcher we return an error if we catched one.
|
||||
if self.pool_catched_panic.swap(false, Ordering::SeqCst) {
|
||||
Err(PanicCatched)
|
||||
} else {
|
||||
Ok(output)
|
||||
}
|
||||
}
|
||||
|
||||
pub fn current_num_threads(&self) -> usize {
|
||||
self.thread_pool.current_num_threads()
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Error, Debug)]
|
||||
#[error("A panic occured. Read the logs to find more information about it")]
|
||||
pub struct PanicCatched;
|
||||
|
||||
#[derive(Default)]
|
||||
pub struct ThreadPoolNoAbortBuilder(ThreadPoolBuilder);
|
||||
|
||||
impl ThreadPoolNoAbortBuilder {
|
||||
pub fn new() -> ThreadPoolNoAbortBuilder {
|
||||
ThreadPoolNoAbortBuilder::default()
|
||||
}
|
||||
|
||||
pub fn thread_name<F>(mut self, closure: F) -> Self
|
||||
where
|
||||
F: FnMut(usize) -> String + 'static,
|
||||
{
|
||||
self.0 = self.0.thread_name(closure);
|
||||
self
|
||||
}
|
||||
|
||||
pub fn num_threads(mut self, num_threads: usize) -> ThreadPoolNoAbortBuilder {
|
||||
self.0 = self.0.num_threads(num_threads);
|
||||
self
|
||||
}
|
||||
|
||||
pub fn build(mut self) -> Result<ThreadPoolNoAbort, rayon::ThreadPoolBuildError> {
|
||||
let pool_catched_panic = Arc::new(AtomicBool::new(false));
|
||||
self.0 = self.0.panic_handler({
|
||||
let catched_panic = pool_catched_panic.clone();
|
||||
move |_result| catched_panic.store(true, Ordering::SeqCst)
|
||||
});
|
||||
Ok(ThreadPoolNoAbort { thread_pool: self.0.build()?, pool_catched_panic })
|
||||
}
|
||||
}
|
@ -71,8 +71,8 @@ pub enum DelAddOperation {
|
||||
/// putting each deletion obkv's keys under an DelAdd::Deletion
|
||||
/// and putting each addition obkv's keys under an DelAdd::Addition
|
||||
pub fn del_add_from_two_obkvs<K: obkv::Key + PartialOrd + Ord>(
|
||||
deletion: obkv::KvReader<K>,
|
||||
addition: obkv::KvReader<K>,
|
||||
deletion: &obkv::KvReader<K>,
|
||||
addition: &obkv::KvReader<K>,
|
||||
buffer: &mut Vec<u8>,
|
||||
) -> Result<(), std::io::Error> {
|
||||
use itertools::merge_join_by;
|
||||
|
@ -499,7 +499,7 @@ impl FacetsUpdateIncrementalInner {
|
||||
ModificationResult::Expand | ModificationResult::Reduce { .. }
|
||||
)
|
||||
{
|
||||
// if any modification occured, insert it in the database.
|
||||
// if any modification occurred, insert it in the database.
|
||||
self.db.put(txn, &insertion_key.as_ref(), &updated_value)?;
|
||||
Ok(insertion_key_modification)
|
||||
} else {
|
||||
|
@ -1,4 +1,4 @@
|
||||
use std::collections::{HashMap, HashSet};
|
||||
use std::collections::HashMap;
|
||||
use std::convert::TryInto;
|
||||
use std::fs::File;
|
||||
use std::io::BufReader;
|
||||
@ -12,6 +12,7 @@ use serde_json::Value;
|
||||
use super::helpers::{create_sorter, keep_latest_obkv, sorter_into_reader, GrenadParameters};
|
||||
use crate::error::{InternalError, SerializationError};
|
||||
use crate::update::del_add::{del_add_from_two_obkvs, DelAdd, KvReaderDelAdd};
|
||||
use crate::update::settings::{InnerIndexSettings, InnerIndexSettingsDiff};
|
||||
use crate::{FieldId, Result, MAX_POSITION_PER_ATTRIBUTE, MAX_WORD_LENGTH};
|
||||
|
||||
pub type ScriptLanguageDocidsMap = HashMap<(Script, Language), (RoaringBitmap, RoaringBitmap)>;
|
||||
@ -25,10 +26,7 @@ pub type ScriptLanguageDocidsMap = HashMap<(Script, Language), (RoaringBitmap, R
|
||||
pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
|
||||
obkv_documents: grenad::Reader<R>,
|
||||
indexer: GrenadParameters,
|
||||
searchable_fields: &Option<HashSet<FieldId>>,
|
||||
stop_words: Option<&fst::Set<Vec<u8>>>,
|
||||
allowed_separators: Option<&[&str]>,
|
||||
dictionary: Option<&[&str]>,
|
||||
settings_diff: &InnerIndexSettingsDiff,
|
||||
max_positions_per_attributes: Option<u32>,
|
||||
) -> Result<(grenad::Reader<BufReader<File>>, ScriptLanguageDocidsMap)> {
|
||||
puffin::profile_function!();
|
||||
@ -36,6 +34,7 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
|
||||
let max_positions_per_attributes = max_positions_per_attributes
|
||||
.map_or(MAX_POSITION_PER_ATTRIBUTE, |max| max.min(MAX_POSITION_PER_ATTRIBUTE));
|
||||
let max_memory = indexer.max_memory_by_thread();
|
||||
let force_reindexing = settings_diff.reindex_searchable();
|
||||
|
||||
// initialize destination values.
|
||||
let mut documents_ids = RoaringBitmap::new();
|
||||
@ -56,8 +55,37 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
|
||||
let mut value_buffer = Vec::new();
|
||||
|
||||
// initialize tokenizer.
|
||||
let mut builder = tokenizer_builder(stop_words, allowed_separators, dictionary, None);
|
||||
let tokenizer = builder.build();
|
||||
let old_stop_words = settings_diff.old.stop_words.as_ref();
|
||||
let old_separators: Option<Vec<_>> = settings_diff
|
||||
.old
|
||||
.allowed_separators
|
||||
.as_ref()
|
||||
.map(|s| s.iter().map(String::as_str).collect());
|
||||
let old_dictionary: Option<Vec<_>> =
|
||||
settings_diff.old.dictionary.as_ref().map(|s| s.iter().map(String::as_str).collect());
|
||||
let mut del_builder = tokenizer_builder(
|
||||
old_stop_words,
|
||||
old_separators.as_deref(),
|
||||
old_dictionary.as_deref(),
|
||||
None,
|
||||
);
|
||||
let del_tokenizer = del_builder.build();
|
||||
|
||||
let new_stop_words = settings_diff.new.stop_words.as_ref();
|
||||
let new_separators: Option<Vec<_>> = settings_diff
|
||||
.new
|
||||
.allowed_separators
|
||||
.as_ref()
|
||||
.map(|s| s.iter().map(String::as_str).collect());
|
||||
let new_dictionary: Option<Vec<_>> =
|
||||
settings_diff.new.dictionary.as_ref().map(|s| s.iter().map(String::as_str).collect());
|
||||
let mut add_builder = tokenizer_builder(
|
||||
new_stop_words,
|
||||
new_separators.as_deref(),
|
||||
new_dictionary.as_deref(),
|
||||
None,
|
||||
);
|
||||
let add_tokenizer = add_builder.build();
|
||||
|
||||
// iterate over documents.
|
||||
let mut cursor = obkv_documents.into_cursor()?;
|
||||
@ -69,7 +97,7 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
|
||||
let obkv = KvReader::<FieldId>::new(value);
|
||||
|
||||
// if the searchable fields didn't change, skip the searchable indexing for this document.
|
||||
if !searchable_fields_changed(&KvReader::<FieldId>::new(value), searchable_fields) {
|
||||
if !force_reindexing && !searchable_fields_changed(&obkv, settings_diff) {
|
||||
continue;
|
||||
}
|
||||
|
||||
@ -85,11 +113,8 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
|
||||
// deletions
|
||||
lang_safe_tokens_from_document(
|
||||
&obkv,
|
||||
searchable_fields,
|
||||
&tokenizer,
|
||||
stop_words,
|
||||
allowed_separators,
|
||||
dictionary,
|
||||
&settings_diff.old,
|
||||
&del_tokenizer,
|
||||
max_positions_per_attributes,
|
||||
DelAdd::Deletion,
|
||||
&mut del_buffers,
|
||||
@ -99,11 +124,8 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
|
||||
// additions
|
||||
lang_safe_tokens_from_document(
|
||||
&obkv,
|
||||
searchable_fields,
|
||||
&tokenizer,
|
||||
stop_words,
|
||||
allowed_separators,
|
||||
dictionary,
|
||||
&settings_diff.new,
|
||||
&add_tokenizer,
|
||||
max_positions_per_attributes,
|
||||
DelAdd::Addition,
|
||||
&mut add_buffers,
|
||||
@ -118,8 +140,8 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
|
||||
// transforming two KV<FieldId, KV<u16, String>> into one KV<FieldId, KV<DelAdd, KV<u16, String>>>
|
||||
value_buffer.clear();
|
||||
del_add_from_two_obkvs(
|
||||
KvReader::<FieldId>::new(del_obkv),
|
||||
KvReader::<FieldId>::new(add_obkv),
|
||||
&KvReader::<FieldId>::new(del_obkv),
|
||||
&KvReader::<FieldId>::new(add_obkv),
|
||||
&mut value_buffer,
|
||||
)?;
|
||||
|
||||
@ -160,8 +182,9 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
|
||||
/// Check if any searchable fields of a document changed.
|
||||
fn searchable_fields_changed(
|
||||
obkv: &KvReader<FieldId>,
|
||||
searchable_fields: &Option<HashSet<FieldId>>,
|
||||
settings_diff: &InnerIndexSettingsDiff,
|
||||
) -> bool {
|
||||
let searchable_fields = &settings_diff.new.searchable_fields_ids;
|
||||
for (field_id, field_bytes) in obkv.iter() {
|
||||
if searchable_fields.as_ref().map_or(true, |sf| sf.contains(&field_id)) {
|
||||
let del_add = KvReaderDelAdd::new(field_bytes);
|
||||
@ -206,14 +229,10 @@ fn tokenizer_builder<'a>(
|
||||
|
||||
/// Extract words mapped with their positions of a document,
|
||||
/// ensuring no language detection mistake was made.
|
||||
#[allow(clippy::too_many_arguments)] // FIXME: consider grouping arguments in a struct
|
||||
fn lang_safe_tokens_from_document<'a>(
|
||||
obkv: &KvReader<FieldId>,
|
||||
searchable_fields: &Option<HashSet<FieldId>>,
|
||||
settings: &InnerIndexSettings,
|
||||
tokenizer: &Tokenizer,
|
||||
stop_words: Option<&fst::Set<Vec<u8>>>,
|
||||
allowed_separators: Option<&[&str]>,
|
||||
dictionary: Option<&[&str]>,
|
||||
max_positions_per_attributes: u32,
|
||||
del_add: DelAdd,
|
||||
buffers: &'a mut Buffers,
|
||||
@ -222,7 +241,7 @@ fn lang_safe_tokens_from_document<'a>(
|
||||
|
||||
tokens_from_document(
|
||||
obkv,
|
||||
searchable_fields,
|
||||
&settings.searchable_fields_ids,
|
||||
tokenizer,
|
||||
max_positions_per_attributes,
|
||||
del_add,
|
||||
@ -246,12 +265,15 @@ fn lang_safe_tokens_from_document<'a>(
|
||||
// then we don't rerun the extraction.
|
||||
if !script_language.is_empty() {
|
||||
// build a new temporary tokenizer including the allow list.
|
||||
let mut builder = tokenizer_builder(
|
||||
stop_words,
|
||||
allowed_separators,
|
||||
dictionary,
|
||||
Some(&script_language),
|
||||
);
|
||||
let stop_words = settings.stop_words.as_ref();
|
||||
let separators: Option<Vec<_>> = settings
|
||||
.allowed_separators
|
||||
.as_ref()
|
||||
.map(|s| s.iter().map(String::as_str).collect());
|
||||
let dictionary: Option<Vec<_>> =
|
||||
settings.dictionary.as_ref().map(|s| s.iter().map(String::as_str).collect());
|
||||
let mut builder =
|
||||
tokenizer_builder(stop_words, separators.as_deref(), dictionary.as_deref(), None);
|
||||
let tokenizer = builder.build();
|
||||
|
||||
script_language_word_count.clear();
|
||||
@ -259,7 +281,7 @@ fn lang_safe_tokens_from_document<'a>(
|
||||
// rerun the extraction.
|
||||
tokens_from_document(
|
||||
obkv,
|
||||
searchable_fields,
|
||||
&settings.searchable_fields_ids,
|
||||
&tokenizer,
|
||||
max_positions_per_attributes,
|
||||
del_add,
|
||||
@ -276,7 +298,7 @@ fn lang_safe_tokens_from_document<'a>(
|
||||
/// Extract words mapped with their positions of a document.
|
||||
fn tokens_from_document<'a>(
|
||||
obkv: &KvReader<FieldId>,
|
||||
searchable_fields: &Option<HashSet<FieldId>>,
|
||||
searchable_fields: &Option<Vec<FieldId>>,
|
||||
tokenizer: &Tokenizer,
|
||||
max_positions_per_attributes: u32,
|
||||
del_add: DelAdd,
|
||||
|
@ -10,6 +10,7 @@ use crate::heed_codec::facet::{
|
||||
FacetGroupKey, FacetGroupKeyCodec, FieldDocIdFacetF64Codec, OrderedF64Codec,
|
||||
};
|
||||
use crate::update::del_add::{KvReaderDelAdd, KvWriterDelAdd};
|
||||
use crate::update::settings::InnerIndexSettingsDiff;
|
||||
use crate::Result;
|
||||
|
||||
/// Extracts the facet number and the documents ids where this facet number appear.
|
||||
@ -20,6 +21,7 @@ use crate::Result;
|
||||
pub fn extract_facet_number_docids<R: io::Read + io::Seek>(
|
||||
fid_docid_facet_number: grenad::Reader<R>,
|
||||
indexer: GrenadParameters,
|
||||
_settings_diff: &InnerIndexSettingsDiff,
|
||||
) -> Result<grenad::Reader<BufReader<File>>> {
|
||||
puffin::profile_function!();
|
||||
|
||||
|
@ -15,6 +15,7 @@ use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
|
||||
use crate::update::index_documents::helpers::{
|
||||
merge_deladd_btreeset_string, merge_deladd_cbo_roaring_bitmaps,
|
||||
};
|
||||
use crate::update::settings::InnerIndexSettingsDiff;
|
||||
use crate::{FieldId, Result, MAX_FACET_VALUE_LENGTH};
|
||||
|
||||
/// Extracts the facet string and the documents ids where this facet string appear.
|
||||
@ -25,6 +26,7 @@ use crate::{FieldId, Result, MAX_FACET_VALUE_LENGTH};
|
||||
pub fn extract_facet_string_docids<R: io::Read + io::Seek>(
|
||||
docid_fid_facet_string: grenad::Reader<R>,
|
||||
indexer: GrenadParameters,
|
||||
_settings_diff: &InnerIndexSettingsDiff,
|
||||
) -> Result<(grenad::Reader<BufReader<File>>, grenad::Reader<BufReader<File>>)> {
|
||||
puffin::profile_function!();
|
||||
|
||||
|
@ -1,5 +1,5 @@
|
||||
use std::borrow::Cow;
|
||||
use std::collections::{BTreeMap, HashSet};
|
||||
use std::collections::BTreeMap;
|
||||
use std::convert::TryInto;
|
||||
use std::fs::File;
|
||||
use std::io::{self, BufReader};
|
||||
@ -20,6 +20,7 @@ use crate::error::InternalError;
|
||||
use crate::facet::value_encoding::f64_into_bytes;
|
||||
use crate::update::del_add::{DelAdd, KvWriterDelAdd};
|
||||
use crate::update::index_documents::{create_writer, writer_into_reader};
|
||||
use crate::update::settings::InnerIndexSettingsDiff;
|
||||
use crate::{CboRoaringBitmapCodec, DocumentId, Error, FieldId, Result, MAX_FACET_VALUE_LENGTH};
|
||||
|
||||
/// The length of the elements that are always in the buffer when inserting new values.
|
||||
@ -36,14 +37,14 @@ pub struct ExtractedFacetValues {
|
||||
|
||||
/// Extracts the facet values of each faceted field of each document.
|
||||
///
|
||||
/// Returns the generated grenad reader containing the docid the fid and the orginal value as key
|
||||
/// Returns the generated grenad reader containing the docid the fid and the original value as key
|
||||
/// and the normalized value as value extracted from the given chunk of documents.
|
||||
/// We need the fid of the geofields to correctly parse them as numbers if they were sent as strings initially.
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract")]
|
||||
pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
|
||||
obkv_documents: grenad::Reader<R>,
|
||||
indexer: GrenadParameters,
|
||||
faceted_fields: &HashSet<FieldId>,
|
||||
settings_diff: &InnerIndexSettingsDiff,
|
||||
geo_fields_ids: Option<(FieldId, FieldId)>,
|
||||
) -> Result<ExtractedFacetValues> {
|
||||
puffin::profile_function!();
|
||||
@ -82,7 +83,9 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
|
||||
let obkv = obkv::KvReader::new(value);
|
||||
|
||||
for (field_id, field_bytes) in obkv.iter() {
|
||||
if faceted_fields.contains(&field_id) {
|
||||
let delete_faceted = settings_diff.old.faceted_fields_ids.contains(&field_id);
|
||||
let add_faceted = settings_diff.new.faceted_fields_ids.contains(&field_id);
|
||||
if delete_faceted || add_faceted {
|
||||
numbers_key_buffer.clear();
|
||||
strings_key_buffer.clear();
|
||||
|
||||
@ -99,11 +102,12 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
|
||||
strings_key_buffer.extend_from_slice(docid_bytes);
|
||||
|
||||
let del_add_obkv = obkv::KvReader::new(field_bytes);
|
||||
let del_value = match del_add_obkv.get(DelAdd::Deletion) {
|
||||
let del_value = match del_add_obkv.get(DelAdd::Deletion).filter(|_| delete_faceted)
|
||||
{
|
||||
Some(bytes) => Some(from_slice(bytes).map_err(InternalError::SerdeJson)?),
|
||||
None => None,
|
||||
};
|
||||
let add_value = match del_add_obkv.get(DelAdd::Addition) {
|
||||
let add_value = match del_add_obkv.get(DelAdd::Addition).filter(|_| add_faceted) {
|
||||
Some(bytes) => Some(from_slice(bytes).map_err(InternalError::SerdeJson)?),
|
||||
None => None,
|
||||
};
|
||||
|
@ -10,6 +10,7 @@ use super::helpers::{
|
||||
use crate::error::SerializationError;
|
||||
use crate::index::db_name::DOCID_WORD_POSITIONS;
|
||||
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
|
||||
use crate::update::settings::InnerIndexSettingsDiff;
|
||||
use crate::Result;
|
||||
|
||||
const MAX_COUNTED_WORDS: usize = 30;
|
||||
@ -23,6 +24,7 @@ const MAX_COUNTED_WORDS: usize = 30;
|
||||
pub fn extract_fid_word_count_docids<R: io::Read + io::Seek>(
|
||||
docid_word_positions: grenad::Reader<R>,
|
||||
indexer: GrenadParameters,
|
||||
_settings_diff: &InnerIndexSettingsDiff,
|
||||
) -> Result<grenad::Reader<BufReader<File>>> {
|
||||
puffin::profile_function!();
|
||||
|
||||
|
@ -17,8 +17,9 @@ use crate::error::UserError;
|
||||
use crate::prompt::Prompt;
|
||||
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
|
||||
use crate::update::index_documents::helpers::try_split_at;
|
||||
use crate::update::settings::InnerIndexSettingsDiff;
|
||||
use crate::vector::Embedder;
|
||||
use crate::{DocumentId, FieldsIdsMap, InternalError, Result, VectorOrArrayOfVectors};
|
||||
use crate::{DocumentId, InternalError, Result, ThreadPoolNoAbort, VectorOrArrayOfVectors};
|
||||
|
||||
/// The length of the elements that are always in the buffer when inserting new values.
|
||||
const TRUNCATE_SIZE: usize = size_of::<DocumentId>();
|
||||
@ -71,12 +72,15 @@ impl VectorStateDelta {
|
||||
pub fn extract_vector_points<R: io::Read + io::Seek>(
|
||||
obkv_documents: grenad::Reader<R>,
|
||||
indexer: GrenadParameters,
|
||||
field_id_map: &FieldsIdsMap,
|
||||
settings_diff: &InnerIndexSettingsDiff,
|
||||
prompt: &Prompt,
|
||||
embedder_name: &str,
|
||||
) -> Result<ExtractedVectorPoints> {
|
||||
puffin::profile_function!();
|
||||
|
||||
let old_fields_ids_map = &settings_diff.old.fields_ids_map;
|
||||
let new_fields_ids_map = &settings_diff.new.fields_ids_map;
|
||||
|
||||
// (docid, _index) -> KvWriterDelAdd -> Vector
|
||||
let mut manual_vectors_writer = create_writer(
|
||||
indexer.chunk_compression_type,
|
||||
@ -98,8 +102,6 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
|
||||
tempfile::tempfile()?,
|
||||
);
|
||||
|
||||
let vectors_fid = field_id_map.id("_vectors");
|
||||
|
||||
let mut key_buffer = Vec::new();
|
||||
let mut cursor = obkv_documents.into_cursor()?;
|
||||
while let Some((key, value)) = cursor.move_on_next()? {
|
||||
@ -116,15 +118,29 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
|
||||
// lazily get it when needed
|
||||
let document_id = || -> Value { from_utf8(external_id_bytes).unwrap().into() };
|
||||
|
||||
let vectors_field = vectors_fid
|
||||
.and_then(|vectors_fid| obkv.get(vectors_fid))
|
||||
.map(KvReaderDelAdd::new)
|
||||
.map(|obkv| to_vector_maps(obkv, document_id))
|
||||
.transpose()?;
|
||||
// the vector field id may have changed
|
||||
let old_vectors_fid = old_fields_ids_map.id("_vectors");
|
||||
// filter the old vector fid if the settings have been changed, forcing reindexing.
|
||||
let old_vectors_fid = old_vectors_fid.filter(|_| !settings_diff.reindex_vectors());
|
||||
|
||||
let (del_map, add_map) = vectors_field.unzip();
|
||||
let del_map = del_map.flatten();
|
||||
let add_map = add_map.flatten();
|
||||
let new_vectors_fid = new_fields_ids_map.id("_vectors");
|
||||
let vectors_field = {
|
||||
let del = old_vectors_fid
|
||||
.and_then(|vectors_fid| obkv.get(vectors_fid))
|
||||
.map(KvReaderDelAdd::new)
|
||||
.map(|obkv| to_vector_map(obkv, DelAdd::Deletion, &document_id))
|
||||
.transpose()?
|
||||
.flatten();
|
||||
let add = new_vectors_fid
|
||||
.and_then(|vectors_fid| obkv.get(vectors_fid))
|
||||
.map(KvReaderDelAdd::new)
|
||||
.map(|obkv| to_vector_map(obkv, DelAdd::Addition, &document_id))
|
||||
.transpose()?
|
||||
.flatten();
|
||||
(del, add)
|
||||
};
|
||||
|
||||
let (del_map, add_map) = vectors_field;
|
||||
|
||||
let del_value = del_map.and_then(|mut map| map.remove(embedder_name));
|
||||
let add_value = add_map.and_then(|mut map| map.remove(embedder_name));
|
||||
@ -155,7 +171,7 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
|
||||
VectorStateDelta::NowGenerated(prompt.render(
|
||||
obkv,
|
||||
DelAdd::Addition,
|
||||
field_id_map,
|
||||
new_fields_ids_map,
|
||||
)?)
|
||||
} else {
|
||||
VectorStateDelta::NowRemoved
|
||||
@ -182,10 +198,16 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
|
||||
|
||||
if document_is_kept {
|
||||
// Don't give up if the old prompt was failing
|
||||
let old_prompt =
|
||||
prompt.render(obkv, DelAdd::Deletion, field_id_map).unwrap_or_default();
|
||||
let new_prompt = prompt.render(obkv, DelAdd::Addition, field_id_map)?;
|
||||
if old_prompt != new_prompt {
|
||||
let old_prompt = Some(prompt)
|
||||
// TODO: this filter works because we erase the vec database when an embedding setting changes.
|
||||
// When vector pipeline will be optimized, this should be removed.
|
||||
.filter(|_| !settings_diff.reindex_vectors())
|
||||
.map(|p| {
|
||||
p.render(obkv, DelAdd::Deletion, old_fields_ids_map).unwrap_or_default()
|
||||
});
|
||||
let new_prompt = prompt.render(obkv, DelAdd::Addition, new_fields_ids_map)?;
|
||||
if old_prompt.as_ref() != Some(&new_prompt) {
|
||||
let old_prompt = old_prompt.unwrap_or_default();
|
||||
tracing::trace!(
|
||||
"🚀 Changing prompt from\n{old_prompt}\n===to===\n{new_prompt}"
|
||||
);
|
||||
@ -207,6 +229,7 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
|
||||
&mut manual_vectors_writer,
|
||||
&mut key_buffer,
|
||||
delta,
|
||||
settings_diff,
|
||||
)?;
|
||||
}
|
||||
|
||||
@ -220,15 +243,6 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
|
||||
})
|
||||
}
|
||||
|
||||
fn to_vector_maps(
|
||||
obkv: KvReaderDelAdd,
|
||||
document_id: impl Fn() -> Value,
|
||||
) -> Result<(Option<serde_json::Map<String, Value>>, Option<serde_json::Map<String, Value>>)> {
|
||||
let del = to_vector_map(obkv, DelAdd::Deletion, &document_id)?;
|
||||
let add = to_vector_map(obkv, DelAdd::Addition, &document_id)?;
|
||||
Ok((del, add))
|
||||
}
|
||||
|
||||
fn to_vector_map(
|
||||
obkv: KvReaderDelAdd,
|
||||
side: DelAdd,
|
||||
@ -256,10 +270,15 @@ fn push_vectors_diff(
|
||||
manual_vectors_writer: &mut Writer<BufWriter<File>>,
|
||||
key_buffer: &mut Vec<u8>,
|
||||
delta: VectorStateDelta,
|
||||
settings_diff: &InnerIndexSettingsDiff,
|
||||
) -> Result<()> {
|
||||
puffin::profile_function!();
|
||||
let (must_remove, prompt, (mut del_vectors, mut add_vectors)) = delta.into_values();
|
||||
if must_remove {
|
||||
if must_remove
|
||||
// TODO: the below condition works because we erase the vec database when an embedding setting changes.
|
||||
// When vector pipeline will be optimized, this should be removed.
|
||||
&& !settings_diff.reindex_vectors()
|
||||
{
|
||||
key_buffer.truncate(TRUNCATE_SIZE);
|
||||
remove_vectors_writer.insert(&key_buffer, [])?;
|
||||
}
|
||||
@ -287,12 +306,16 @@ fn push_vectors_diff(
|
||||
match eob {
|
||||
EitherOrBoth::Both(_, _) => (), // no need to touch anything
|
||||
EitherOrBoth::Left(vector) => {
|
||||
// We insert only the Del part of the Obkv to inform
|
||||
// that we only want to remove all those vectors.
|
||||
let mut obkv = KvWriterDelAdd::memory();
|
||||
obkv.insert(DelAdd::Deletion, cast_slice(&vector))?;
|
||||
let bytes = obkv.into_inner()?;
|
||||
manual_vectors_writer.insert(&key_buffer, bytes)?;
|
||||
// TODO: the below condition works because we erase the vec database when an embedding setting changes.
|
||||
// When vector pipeline will be optimized, this should be removed.
|
||||
if !settings_diff.reindex_vectors() {
|
||||
// We insert only the Del part of the Obkv to inform
|
||||
// that we only want to remove all those vectors.
|
||||
let mut obkv = KvWriterDelAdd::memory();
|
||||
obkv.insert(DelAdd::Deletion, cast_slice(&vector))?;
|
||||
let bytes = obkv.into_inner()?;
|
||||
manual_vectors_writer.insert(&key_buffer, bytes)?;
|
||||
}
|
||||
}
|
||||
EitherOrBoth::Right(vector) => {
|
||||
// We insert only the Add part of the Obkv to inform
|
||||
@ -339,7 +362,7 @@ pub fn extract_embeddings<R: io::Read + io::Seek>(
|
||||
prompt_reader: grenad::Reader<R>,
|
||||
indexer: GrenadParameters,
|
||||
embedder: Arc<Embedder>,
|
||||
request_threads: &rayon::ThreadPool,
|
||||
request_threads: &ThreadPoolNoAbort,
|
||||
) -> Result<grenad::Reader<BufReader<File>>> {
|
||||
puffin::profile_function!();
|
||||
let n_chunks = embedder.chunk_count_hint(); // chunk level parallelism
|
||||
|
@ -1,20 +1,23 @@
|
||||
use std::collections::{BTreeSet, HashSet};
|
||||
use std::collections::BTreeSet;
|
||||
use std::fs::File;
|
||||
use std::io::{self, BufReader};
|
||||
|
||||
use heed::BytesDecode;
|
||||
use heed::{BytesDecode, BytesEncode};
|
||||
use obkv::KvReaderU16;
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use super::helpers::{
|
||||
create_sorter, create_writer, merge_deladd_cbo_roaring_bitmaps, sorter_into_reader,
|
||||
try_split_array_at, writer_into_reader, GrenadParameters,
|
||||
create_sorter, create_writer, merge_deladd_cbo_roaring_bitmaps, try_split_array_at,
|
||||
writer_into_reader, GrenadParameters,
|
||||
};
|
||||
use crate::error::SerializationError;
|
||||
use crate::heed_codec::StrBEU16Codec;
|
||||
use crate::index::db_name::DOCID_WORD_POSITIONS;
|
||||
use crate::update::del_add::{is_noop_del_add_obkv, DelAdd, KvReaderDelAdd, KvWriterDelAdd};
|
||||
use crate::update::index_documents::helpers::sorter_into_reader;
|
||||
use crate::update::settings::InnerIndexSettingsDiff;
|
||||
use crate::update::MergeFn;
|
||||
use crate::{DocumentId, FieldId, Result};
|
||||
use crate::{CboRoaringBitmapCodec, DocumentId, FieldId, Result};
|
||||
|
||||
/// Extracts the word and the documents ids where this word appear.
|
||||
///
|
||||
@ -27,7 +30,7 @@ use crate::{DocumentId, FieldId, Result};
|
||||
pub fn extract_word_docids<R: io::Read + io::Seek>(
|
||||
docid_word_positions: grenad::Reader<R>,
|
||||
indexer: GrenadParameters,
|
||||
exact_attributes: &HashSet<FieldId>,
|
||||
settings_diff: &InnerIndexSettingsDiff,
|
||||
) -> Result<(
|
||||
grenad::Reader<BufReader<File>>,
|
||||
grenad::Reader<BufReader<File>>,
|
||||
@ -43,7 +46,7 @@ pub fn extract_word_docids<R: io::Read + io::Seek>(
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
indexer.max_nb_chunks,
|
||||
max_memory.map(|x| x / 3),
|
||||
max_memory.map(|m| m / 3),
|
||||
);
|
||||
let mut key_buffer = Vec::new();
|
||||
let mut del_words = BTreeSet::new();
|
||||
@ -85,13 +88,19 @@ pub fn extract_word_docids<R: io::Read + io::Seek>(
|
||||
add_words.clear();
|
||||
}
|
||||
|
||||
let mut word_fid_docids_writer = create_writer(
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
tempfile::tempfile()?,
|
||||
);
|
||||
|
||||
let mut word_docids_sorter = create_sorter(
|
||||
grenad::SortAlgorithm::Unstable,
|
||||
merge_deladd_cbo_roaring_bitmaps,
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
indexer.max_nb_chunks,
|
||||
max_memory.map(|x| x / 3),
|
||||
max_memory.map(|m| m / 3),
|
||||
);
|
||||
|
||||
let mut exact_word_docids_sorter = create_sorter(
|
||||
@ -100,31 +109,45 @@ pub fn extract_word_docids<R: io::Read + io::Seek>(
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
indexer.max_nb_chunks,
|
||||
max_memory.map(|x| x / 3),
|
||||
);
|
||||
|
||||
let mut word_fid_docids_writer = create_writer(
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
tempfile::tempfile()?,
|
||||
max_memory.map(|m| m / 3),
|
||||
);
|
||||
|
||||
let mut iter = word_fid_docids_sorter.into_stream_merger_iter()?;
|
||||
// TODO: replace sorters by writers by accumulating values into a buffer before inserting them.
|
||||
let mut buffer = Vec::new();
|
||||
// NOTE: replacing sorters by bitmap merging is less efficient, so, use sorters.
|
||||
while let Some((key, value)) = iter.next()? {
|
||||
// only keep the value if there is a change to apply in the DB.
|
||||
if !is_noop_del_add_obkv(KvReaderDelAdd::new(value)) {
|
||||
word_fid_docids_writer.insert(key, value)?;
|
||||
}
|
||||
|
||||
let (word, fid) = StrBEU16Codec::bytes_decode(key)
|
||||
let (w, fid) = StrBEU16Codec::bytes_decode(key)
|
||||
.map_err(|_| SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
|
||||
|
||||
// every words contained in an attribute set to exact must be pushed in the exact_words list.
|
||||
if exact_attributes.contains(&fid) {
|
||||
exact_word_docids_sorter.insert(word.as_bytes(), value)?;
|
||||
} else {
|
||||
word_docids_sorter.insert(word.as_bytes(), value)?;
|
||||
// merge all deletions
|
||||
let obkv = KvReaderDelAdd::new(value);
|
||||
if let Some(value) = obkv.get(DelAdd::Deletion) {
|
||||
let delete_from_exact = settings_diff.old.exact_attributes.contains(&fid);
|
||||
buffer.clear();
|
||||
let mut obkv = KvWriterDelAdd::new(&mut buffer);
|
||||
obkv.insert(DelAdd::Deletion, value)?;
|
||||
if delete_from_exact {
|
||||
exact_word_docids_sorter.insert(w, obkv.into_inner().unwrap())?;
|
||||
} else {
|
||||
word_docids_sorter.insert(w, obkv.into_inner().unwrap())?;
|
||||
}
|
||||
}
|
||||
// merge all additions
|
||||
if let Some(value) = obkv.get(DelAdd::Addition) {
|
||||
let add_in_exact = settings_diff.new.exact_attributes.contains(&fid);
|
||||
buffer.clear();
|
||||
let mut obkv = KvWriterDelAdd::new(&mut buffer);
|
||||
obkv.insert(DelAdd::Addition, value)?;
|
||||
if add_in_exact {
|
||||
exact_word_docids_sorter.insert(w, obkv.into_inner().unwrap())?;
|
||||
} else {
|
||||
word_docids_sorter.insert(w, obkv.into_inner().unwrap())?;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -178,3 +201,45 @@ fn words_into_sorter(
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract")]
|
||||
fn docids_into_writers<W>(
|
||||
word: &str,
|
||||
deletions: &RoaringBitmap,
|
||||
additions: &RoaringBitmap,
|
||||
writer: &mut grenad::Writer<W>,
|
||||
) -> Result<()>
|
||||
where
|
||||
W: std::io::Write,
|
||||
{
|
||||
if deletions == additions {
|
||||
// if the same value is deleted and added, do nothing.
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
// Write each value in the same KvDelAdd before inserting it in the final writer.
|
||||
let mut obkv = KvWriterDelAdd::memory();
|
||||
// deletions:
|
||||
if !deletions.is_empty() && !deletions.is_subset(additions) {
|
||||
obkv.insert(
|
||||
DelAdd::Deletion,
|
||||
CboRoaringBitmapCodec::bytes_encode(deletions).map_err(|_| {
|
||||
SerializationError::Encoding { db_name: Some(DOCID_WORD_POSITIONS) }
|
||||
})?,
|
||||
)?;
|
||||
}
|
||||
// additions:
|
||||
if !additions.is_empty() {
|
||||
obkv.insert(
|
||||
DelAdd::Addition,
|
||||
CboRoaringBitmapCodec::bytes_encode(additions).map_err(|_| {
|
||||
SerializationError::Encoding { db_name: Some(DOCID_WORD_POSITIONS) }
|
||||
})?,
|
||||
)?;
|
||||
}
|
||||
|
||||
// insert everything in the same writer.
|
||||
writer.insert(word.as_bytes(), obkv.into_inner().unwrap())?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
@ -11,8 +11,9 @@ use super::helpers::{
|
||||
};
|
||||
use crate::error::SerializationError;
|
||||
use crate::index::db_name::DOCID_WORD_POSITIONS;
|
||||
use crate::proximity::{index_proximity, MAX_DISTANCE};
|
||||
use crate::proximity::{index_proximity, ProximityPrecision, MAX_DISTANCE};
|
||||
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
|
||||
use crate::update::settings::InnerIndexSettingsDiff;
|
||||
use crate::{DocumentId, Result};
|
||||
|
||||
/// Extracts the best proximity between pairs of words and the documents ids where this pair appear.
|
||||
@ -23,8 +24,21 @@ use crate::{DocumentId, Result};
|
||||
pub fn extract_word_pair_proximity_docids<R: io::Read + io::Seek>(
|
||||
docid_word_positions: grenad::Reader<R>,
|
||||
indexer: GrenadParameters,
|
||||
settings_diff: &InnerIndexSettingsDiff,
|
||||
) -> Result<grenad::Reader<BufReader<File>>> {
|
||||
puffin::profile_function!();
|
||||
let any_deletion = settings_diff.old.proximity_precision == ProximityPrecision::ByWord;
|
||||
let any_addition = settings_diff.new.proximity_precision == ProximityPrecision::ByWord;
|
||||
|
||||
// early return if the data shouldn't be deleted nor created.
|
||||
if !any_deletion && !any_addition {
|
||||
let writer = create_writer(
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
tempfile::tempfile()?,
|
||||
);
|
||||
return writer_into_reader(writer);
|
||||
}
|
||||
|
||||
let max_memory = indexer.max_memory_by_thread();
|
||||
|
||||
@ -77,6 +91,10 @@ pub fn extract_word_pair_proximity_docids<R: io::Read + io::Seek>(
|
||||
|
||||
let (del, add): (Result<_>, Result<_>) = rayon::join(
|
||||
|| {
|
||||
if !any_deletion {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
// deletions
|
||||
if let Some(deletion) = KvReaderDelAdd::new(value).get(DelAdd::Deletion) {
|
||||
for (position, word) in KvReaderU16::new(deletion).iter() {
|
||||
@ -106,6 +124,10 @@ pub fn extract_word_pair_proximity_docids<R: io::Read + io::Seek>(
|
||||
Ok(())
|
||||
},
|
||||
|| {
|
||||
if !any_addition {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
// additions
|
||||
if let Some(addition) = KvReaderDelAdd::new(value).get(DelAdd::Addition) {
|
||||
for (position, word) in KvReaderU16::new(addition).iter() {
|
||||
|
@ -11,6 +11,7 @@ use super::helpers::{
|
||||
use crate::error::SerializationError;
|
||||
use crate::index::db_name::DOCID_WORD_POSITIONS;
|
||||
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
|
||||
use crate::update::settings::InnerIndexSettingsDiff;
|
||||
use crate::update::MergeFn;
|
||||
use crate::{bucketed_position, DocumentId, Result};
|
||||
|
||||
@ -22,6 +23,7 @@ use crate::{bucketed_position, DocumentId, Result};
|
||||
pub fn extract_word_position_docids<R: io::Read + io::Seek>(
|
||||
docid_word_positions: grenad::Reader<R>,
|
||||
indexer: GrenadParameters,
|
||||
_settings_diff: &InnerIndexSettingsDiff,
|
||||
) -> Result<grenad::Reader<BufReader<File>>> {
|
||||
puffin::profile_function!();
|
||||
|
||||
|
@ -9,9 +9,9 @@ mod extract_word_docids;
|
||||
mod extract_word_pair_proximity_docids;
|
||||
mod extract_word_position_docids;
|
||||
|
||||
use std::collections::HashSet;
|
||||
use std::fs::File;
|
||||
use std::io::BufReader;
|
||||
use std::sync::Arc;
|
||||
|
||||
use crossbeam_channel::Sender;
|
||||
use rayon::prelude::*;
|
||||
@ -30,9 +30,8 @@ use self::extract_word_pair_proximity_docids::extract_word_pair_proximity_docids
|
||||
use self::extract_word_position_docids::extract_word_position_docids;
|
||||
use super::helpers::{as_cloneable_grenad, CursorClonableMmap, GrenadParameters};
|
||||
use super::{helpers, TypedChunk};
|
||||
use crate::proximity::ProximityPrecision;
|
||||
use crate::vector::EmbeddingConfigs;
|
||||
use crate::{FieldId, FieldsIdsMap, Result};
|
||||
use crate::update::settings::InnerIndexSettingsDiff;
|
||||
use crate::{FieldId, Result, ThreadPoolNoAbortBuilder};
|
||||
|
||||
/// Extract data for each databases from obkv documents in parallel.
|
||||
/// Send data in grenad file over provided Sender.
|
||||
@ -43,18 +42,10 @@ pub(crate) fn data_from_obkv_documents(
|
||||
flattened_obkv_chunks: impl Iterator<Item = Result<grenad::Reader<BufReader<File>>>> + Send,
|
||||
indexer: GrenadParameters,
|
||||
lmdb_writer_sx: Sender<Result<TypedChunk>>,
|
||||
searchable_fields: Option<HashSet<FieldId>>,
|
||||
faceted_fields: HashSet<FieldId>,
|
||||
primary_key_id: FieldId,
|
||||
geo_fields_ids: Option<(FieldId, FieldId)>,
|
||||
field_id_map: FieldsIdsMap,
|
||||
stop_words: Option<fst::Set<Vec<u8>>>,
|
||||
allowed_separators: Option<&[&str]>,
|
||||
dictionary: Option<&[&str]>,
|
||||
settings_diff: Arc<InnerIndexSettingsDiff>,
|
||||
max_positions_per_attributes: Option<u32>,
|
||||
exact_attributes: HashSet<FieldId>,
|
||||
proximity_precision: ProximityPrecision,
|
||||
embedders: EmbeddingConfigs,
|
||||
) -> Result<()> {
|
||||
puffin::profile_function!();
|
||||
|
||||
@ -67,8 +58,7 @@ pub(crate) fn data_from_obkv_documents(
|
||||
original_documents_chunk,
|
||||
indexer,
|
||||
lmdb_writer_sx.clone(),
|
||||
field_id_map.clone(),
|
||||
embedders.clone(),
|
||||
settings_diff.clone(),
|
||||
)
|
||||
})
|
||||
.collect::<Result<()>>()
|
||||
@ -81,13 +71,9 @@ pub(crate) fn data_from_obkv_documents(
|
||||
flattened_obkv_chunks,
|
||||
indexer,
|
||||
lmdb_writer_sx.clone(),
|
||||
&searchable_fields,
|
||||
&faceted_fields,
|
||||
primary_key_id,
|
||||
geo_fields_ids,
|
||||
&stop_words,
|
||||
&allowed_separators,
|
||||
&dictionary,
|
||||
settings_diff.clone(),
|
||||
max_positions_per_attributes,
|
||||
)
|
||||
})
|
||||
@ -100,13 +86,12 @@ pub(crate) fn data_from_obkv_documents(
|
||||
run_extraction_task::<_, _, grenad::Reader<BufReader<File>>>(
|
||||
docid_word_positions_chunk.clone(),
|
||||
indexer,
|
||||
settings_diff.clone(),
|
||||
lmdb_writer_sx.clone(),
|
||||
extract_fid_word_count_docids,
|
||||
TypedChunk::FieldIdWordCountDocids,
|
||||
"field-id-wordcount-docids",
|
||||
);
|
||||
|
||||
let exact_attributes = exact_attributes.clone();
|
||||
run_extraction_task::<
|
||||
_,
|
||||
_,
|
||||
@ -118,10 +103,9 @@ pub(crate) fn data_from_obkv_documents(
|
||||
>(
|
||||
docid_word_positions_chunk.clone(),
|
||||
indexer,
|
||||
settings_diff.clone(),
|
||||
lmdb_writer_sx.clone(),
|
||||
move |doc_word_pos, indexer| {
|
||||
extract_word_docids(doc_word_pos, indexer, &exact_attributes)
|
||||
},
|
||||
extract_word_docids,
|
||||
|(
|
||||
word_docids_reader,
|
||||
exact_word_docids_reader,
|
||||
@ -139,6 +123,7 @@ pub(crate) fn data_from_obkv_documents(
|
||||
run_extraction_task::<_, _, grenad::Reader<BufReader<File>>>(
|
||||
docid_word_positions_chunk.clone(),
|
||||
indexer,
|
||||
settings_diff.clone(),
|
||||
lmdb_writer_sx.clone(),
|
||||
extract_word_position_docids,
|
||||
TypedChunk::WordPositionDocids,
|
||||
@ -152,6 +137,7 @@ pub(crate) fn data_from_obkv_documents(
|
||||
>(
|
||||
fid_docid_facet_strings_chunk.clone(),
|
||||
indexer,
|
||||
settings_diff.clone(),
|
||||
lmdb_writer_sx.clone(),
|
||||
extract_facet_string_docids,
|
||||
TypedChunk::FieldIdFacetStringDocids,
|
||||
@ -161,22 +147,22 @@ pub(crate) fn data_from_obkv_documents(
|
||||
run_extraction_task::<_, _, grenad::Reader<BufReader<File>>>(
|
||||
fid_docid_facet_numbers_chunk.clone(),
|
||||
indexer,
|
||||
settings_diff.clone(),
|
||||
lmdb_writer_sx.clone(),
|
||||
extract_facet_number_docids,
|
||||
TypedChunk::FieldIdFacetNumberDocids,
|
||||
"field-id-facet-number-docids",
|
||||
);
|
||||
|
||||
if proximity_precision == ProximityPrecision::ByWord {
|
||||
run_extraction_task::<_, _, grenad::Reader<BufReader<File>>>(
|
||||
docid_word_positions_chunk.clone(),
|
||||
indexer,
|
||||
lmdb_writer_sx.clone(),
|
||||
extract_word_pair_proximity_docids,
|
||||
TypedChunk::WordPairProximityDocids,
|
||||
"word-pair-proximity-docids",
|
||||
);
|
||||
}
|
||||
run_extraction_task::<_, _, grenad::Reader<BufReader<File>>>(
|
||||
docid_word_positions_chunk.clone(),
|
||||
indexer,
|
||||
settings_diff.clone(),
|
||||
lmdb_writer_sx.clone(),
|
||||
extract_word_pair_proximity_docids,
|
||||
TypedChunk::WordPairProximityDocids,
|
||||
"word-pair-proximity-docids",
|
||||
);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
@ -195,12 +181,17 @@ pub(crate) fn data_from_obkv_documents(
|
||||
fn run_extraction_task<FE, FS, M>(
|
||||
chunk: grenad::Reader<CursorClonableMmap>,
|
||||
indexer: GrenadParameters,
|
||||
settings_diff: Arc<InnerIndexSettingsDiff>,
|
||||
lmdb_writer_sx: Sender<Result<TypedChunk>>,
|
||||
extract_fn: FE,
|
||||
serialize_fn: FS,
|
||||
name: &'static str,
|
||||
) where
|
||||
FE: Fn(grenad::Reader<CursorClonableMmap>, GrenadParameters) -> Result<M>
|
||||
FE: Fn(
|
||||
grenad::Reader<CursorClonableMmap>,
|
||||
GrenadParameters,
|
||||
&InnerIndexSettingsDiff,
|
||||
) -> Result<M>
|
||||
+ Sync
|
||||
+ Send
|
||||
+ 'static,
|
||||
@ -213,7 +204,7 @@ fn run_extraction_task<FE, FS, M>(
|
||||
let child_span = tracing::trace_span!(target: "indexing::extract::details", parent: &current_span, "extract_multiple_chunks");
|
||||
let _entered = child_span.enter();
|
||||
puffin::profile_scope!("extract_multiple_chunks", name);
|
||||
match extract_fn(chunk, indexer) {
|
||||
match extract_fn(chunk, indexer, &settings_diff) {
|
||||
Ok(chunk) => {
|
||||
let _ = lmdb_writer_sx.send(Ok(serialize_fn(chunk)));
|
||||
}
|
||||
@ -230,8 +221,7 @@ fn send_original_documents_data(
|
||||
original_documents_chunk: Result<grenad::Reader<BufReader<File>>>,
|
||||
indexer: GrenadParameters,
|
||||
lmdb_writer_sx: Sender<Result<TypedChunk>>,
|
||||
field_id_map: FieldsIdsMap,
|
||||
embedders: EmbeddingConfigs,
|
||||
settings_diff: Arc<InnerIndexSettingsDiff>,
|
||||
) -> Result<()> {
|
||||
let original_documents_chunk =
|
||||
original_documents_chunk.and_then(|c| unsafe { as_cloneable_grenad(&c) })?;
|
||||
@ -239,55 +229,58 @@ fn send_original_documents_data(
|
||||
let documents_chunk_cloned = original_documents_chunk.clone();
|
||||
let lmdb_writer_sx_cloned = lmdb_writer_sx.clone();
|
||||
|
||||
let request_threads = rayon::ThreadPoolBuilder::new()
|
||||
let request_threads = ThreadPoolNoAbortBuilder::new()
|
||||
.num_threads(crate::vector::REQUEST_PARALLELISM)
|
||||
.thread_name(|index| format!("embedding-request-{index}"))
|
||||
.build()?;
|
||||
|
||||
rayon::spawn(move || {
|
||||
for (name, (embedder, prompt)) in embedders {
|
||||
let result = extract_vector_points(
|
||||
documents_chunk_cloned.clone(),
|
||||
indexer,
|
||||
&field_id_map,
|
||||
&prompt,
|
||||
&name,
|
||||
);
|
||||
match result {
|
||||
Ok(ExtractedVectorPoints { manual_vectors, remove_vectors, prompts }) => {
|
||||
let embeddings = match extract_embeddings(
|
||||
prompts,
|
||||
indexer,
|
||||
embedder.clone(),
|
||||
&request_threads,
|
||||
) {
|
||||
Ok(results) => Some(results),
|
||||
Err(error) => {
|
||||
let _ = lmdb_writer_sx_cloned.send(Err(error));
|
||||
None
|
||||
}
|
||||
};
|
||||
if settings_diff.reindex_vectors() || !settings_diff.settings_update_only() {
|
||||
let settings_diff = settings_diff.clone();
|
||||
rayon::spawn(move || {
|
||||
for (name, (embedder, prompt)) in settings_diff.new.embedding_configs.clone() {
|
||||
let result = extract_vector_points(
|
||||
documents_chunk_cloned.clone(),
|
||||
indexer,
|
||||
&settings_diff,
|
||||
&prompt,
|
||||
&name,
|
||||
);
|
||||
match result {
|
||||
Ok(ExtractedVectorPoints { manual_vectors, remove_vectors, prompts }) => {
|
||||
let embeddings = match extract_embeddings(
|
||||
prompts,
|
||||
indexer,
|
||||
embedder.clone(),
|
||||
&request_threads,
|
||||
) {
|
||||
Ok(results) => Some(results),
|
||||
Err(error) => {
|
||||
let _ = lmdb_writer_sx_cloned.send(Err(error));
|
||||
None
|
||||
}
|
||||
};
|
||||
|
||||
if !(remove_vectors.is_empty()
|
||||
&& manual_vectors.is_empty()
|
||||
&& embeddings.as_ref().map_or(true, |e| e.is_empty()))
|
||||
{
|
||||
let _ = lmdb_writer_sx_cloned.send(Ok(TypedChunk::VectorPoints {
|
||||
remove_vectors,
|
||||
embeddings,
|
||||
expected_dimension: embedder.dimensions(),
|
||||
manual_vectors,
|
||||
embedder_name: name,
|
||||
}));
|
||||
if !(remove_vectors.is_empty()
|
||||
&& manual_vectors.is_empty()
|
||||
&& embeddings.as_ref().map_or(true, |e| e.is_empty()))
|
||||
{
|
||||
let _ = lmdb_writer_sx_cloned.send(Ok(TypedChunk::VectorPoints {
|
||||
remove_vectors,
|
||||
embeddings,
|
||||
expected_dimension: embedder.dimensions(),
|
||||
manual_vectors,
|
||||
embedder_name: name,
|
||||
}));
|
||||
}
|
||||
}
|
||||
|
||||
Err(error) => {
|
||||
let _ = lmdb_writer_sx_cloned.send(Err(error));
|
||||
}
|
||||
}
|
||||
|
||||
Err(error) => {
|
||||
let _ = lmdb_writer_sx_cloned.send(Err(error));
|
||||
}
|
||||
}
|
||||
}
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
// TODO: create a custom internal error
|
||||
let _ = lmdb_writer_sx.send(Ok(TypedChunk::Documents(original_documents_chunk)));
|
||||
@ -306,13 +299,9 @@ fn send_and_extract_flattened_documents_data(
|
||||
flattened_documents_chunk: Result<grenad::Reader<BufReader<File>>>,
|
||||
indexer: GrenadParameters,
|
||||
lmdb_writer_sx: Sender<Result<TypedChunk>>,
|
||||
searchable_fields: &Option<HashSet<FieldId>>,
|
||||
faceted_fields: &HashSet<FieldId>,
|
||||
primary_key_id: FieldId,
|
||||
geo_fields_ids: Option<(FieldId, FieldId)>,
|
||||
stop_words: &Option<fst::Set<Vec<u8>>>,
|
||||
allowed_separators: &Option<&[&str]>,
|
||||
dictionary: &Option<&[&str]>,
|
||||
settings_diff: Arc<InnerIndexSettingsDiff>,
|
||||
max_positions_per_attributes: Option<u32>,
|
||||
) -> Result<(
|
||||
grenad::Reader<CursorClonableMmap>,
|
||||
@ -341,10 +330,7 @@ fn send_and_extract_flattened_documents_data(
|
||||
extract_docid_word_positions(
|
||||
flattened_documents_chunk.clone(),
|
||||
indexer,
|
||||
searchable_fields,
|
||||
stop_words.as_ref(),
|
||||
*allowed_separators,
|
||||
*dictionary,
|
||||
&settings_diff,
|
||||
max_positions_per_attributes,
|
||||
)?;
|
||||
|
||||
@ -367,7 +353,7 @@ fn send_and_extract_flattened_documents_data(
|
||||
} = extract_fid_docid_facet_values(
|
||||
flattened_documents_chunk.clone(),
|
||||
indexer,
|
||||
faceted_fields,
|
||||
&settings_diff,
|
||||
geo_fields_ids,
|
||||
)?;
|
||||
|
||||
|
@ -6,15 +6,16 @@ mod typed_chunk;
|
||||
|
||||
use std::collections::{HashMap, HashSet};
|
||||
use std::io::{Read, Seek};
|
||||
use std::iter::FromIterator;
|
||||
use std::num::NonZeroU32;
|
||||
use std::result::Result as StdResult;
|
||||
use std::sync::Arc;
|
||||
|
||||
use crossbeam_channel::{Receiver, Sender};
|
||||
use grenad::{Merger, MergerBuilder};
|
||||
use heed::types::Str;
|
||||
use heed::Database;
|
||||
use rand::SeedableRng;
|
||||
use rhai::{Dynamic, Engine, OptimizationLevel, Scope};
|
||||
use roaring::RoaringBitmap;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use slice_group_by::GroupBy;
|
||||
@ -31,14 +32,15 @@ pub use self::helpers::{
|
||||
};
|
||||
use self::helpers::{grenad_obkv_into_chunks, GrenadParameters};
|
||||
pub use self::transform::{Transform, TransformOutput};
|
||||
use crate::documents::{obkv_to_object, DocumentsBatchReader};
|
||||
use crate::documents::{obkv_to_object, DocumentsBatchBuilder, DocumentsBatchReader};
|
||||
use crate::error::{Error, InternalError, UserError};
|
||||
use crate::thread_pool_no_abort::ThreadPoolNoAbortBuilder;
|
||||
pub use crate::update::index_documents::helpers::CursorClonableMmap;
|
||||
use crate::update::{
|
||||
IndexerConfig, UpdateIndexingStep, WordPrefixDocids, WordPrefixIntegerDocids, WordsPrefixesFst,
|
||||
};
|
||||
use crate::vector::EmbeddingConfigs;
|
||||
use crate::{CboRoaringBitmapCodec, Index, Result};
|
||||
use crate::{all_obkv_to_json, CboRoaringBitmapCodec, FieldsIdsMap, Index, Object, Result};
|
||||
|
||||
static MERGED_DATABASE_COUNT: usize = 7;
|
||||
static PREFIX_DATABASE_COUNT: usize = 4;
|
||||
@ -172,6 +174,119 @@ where
|
||||
Ok((self, Ok(indexed_documents)))
|
||||
}
|
||||
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::documents")]
|
||||
pub fn edit_documents(
|
||||
self,
|
||||
documents: &RoaringBitmap,
|
||||
context: Option<Object>,
|
||||
code: &str,
|
||||
) -> Result<(Self, StdResult<(u64, u64), UserError>)> {
|
||||
// Early return when there is no document to add
|
||||
if documents.is_empty() {
|
||||
return Ok((self, Ok((0, 0))));
|
||||
}
|
||||
|
||||
/// Transform every field of a raw obkv store into a Rhai Map.
|
||||
pub fn all_obkv_to_rhaimap(
|
||||
obkv: obkv::KvReaderU16,
|
||||
fields_ids_map: &FieldsIdsMap,
|
||||
) -> Result<rhai::Map> {
|
||||
let all_keys = obkv.iter().map(|(k, _v)| k).collect::<Vec<_>>();
|
||||
all_keys
|
||||
.iter()
|
||||
.copied()
|
||||
.flat_map(|id| obkv.get(id).map(|value| (id, value)))
|
||||
.map(|(id, value)| {
|
||||
let name = fields_ids_map.name(id).ok_or(
|
||||
crate::error::FieldIdMapMissingEntry::FieldId {
|
||||
field_id: id,
|
||||
process: "allobkv_to_rhaimap",
|
||||
},
|
||||
)?;
|
||||
let value = serde_json::from_slice(value)
|
||||
.map_err(crate::error::InternalError::SerdeJson)?;
|
||||
Ok((name.into(), value))
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
fn rhaimap_to_object(map: rhai::Map) -> Object {
|
||||
let mut output = Object::new();
|
||||
for (key, value) in map {
|
||||
let value = serde_json::to_value(&value).unwrap();
|
||||
output.insert(key.into(), value);
|
||||
}
|
||||
output
|
||||
}
|
||||
|
||||
let mut engine = Engine::new();
|
||||
engine.set_optimization_level(OptimizationLevel::Full);
|
||||
//It is an arbitrary value. We need to let users define this in the settings.
|
||||
engine.set_max_operations(1_000_000);
|
||||
|
||||
let ast = engine.compile(code).unwrap();
|
||||
let fields_ids_map = self.index.fields_ids_map(self.wtxn)?;
|
||||
let primary_key = self.index.primary_key(self.wtxn)?.unwrap();
|
||||
let primary_key_id = fields_ids_map.id(primary_key).unwrap();
|
||||
let mut documents_batch_builder = tempfile::tempfile().map(DocumentsBatchBuilder::new)?;
|
||||
let mut documents_to_remove = RoaringBitmap::new();
|
||||
|
||||
let context: Dynamic = match context {
|
||||
Some(context) => serde_json::from_value(context.into()).unwrap(),
|
||||
None => Dynamic::from(()),
|
||||
};
|
||||
|
||||
for docid in documents {
|
||||
let (document, document_object, document_id) =
|
||||
match self.index.documents.get(self.wtxn, &docid)? {
|
||||
Some(obkv) => {
|
||||
let document_id_bytes = obkv.get(primary_key_id).unwrap();
|
||||
let document_id: serde_json::Value =
|
||||
serde_json::from_slice(document_id_bytes).unwrap();
|
||||
let document = all_obkv_to_rhaimap(obkv, &fields_ids_map)?;
|
||||
let document_object = all_obkv_to_json(obkv, &fields_ids_map)?;
|
||||
(document, document_object, document_id)
|
||||
}
|
||||
None => panic!("documents must exist"),
|
||||
};
|
||||
|
||||
let mut scope = Scope::new();
|
||||
scope.push_constant_dynamic("context", context.clone());
|
||||
scope.push("doc", document);
|
||||
let _ = engine.eval_ast_with_scope::<Dynamic>(&mut scope, &ast).unwrap();
|
||||
let new_document = match scope.remove::<Dynamic>("doc") {
|
||||
// If the "doc" variable has been removed from the scope
|
||||
// or set to (), we effectively delete the document.
|
||||
Some(doc) if doc.is_unit() => {
|
||||
documents_to_remove.push(docid);
|
||||
continue;
|
||||
}
|
||||
None => unreachable!(),
|
||||
Some(document) => match document.try_cast() {
|
||||
Some(document) => rhaimap_to_object(document),
|
||||
None => panic!("Why is \"doc\" no longer a Map?"),
|
||||
},
|
||||
};
|
||||
|
||||
if document_object != new_document {
|
||||
assert_eq!(
|
||||
Some(&document_id),
|
||||
new_document.get(primary_key),
|
||||
"you cannot change the document id when editing documents"
|
||||
);
|
||||
documents_batch_builder.append_json_object(&new_document)?;
|
||||
}
|
||||
}
|
||||
|
||||
let file = documents_batch_builder.into_inner()?;
|
||||
let reader = DocumentsBatchReader::from_reader(file)?;
|
||||
|
||||
let (this, removed) = self.remove_documents_from_db_no_batch(&documents_to_remove)?;
|
||||
let (this, result) = this.add_documents(reader)?;
|
||||
|
||||
Ok((this, result.map(|added| (removed, added))))
|
||||
}
|
||||
|
||||
pub fn with_embedders(mut self, embedders: EmbeddingConfigs) -> Self {
|
||||
self.embedders = embedders;
|
||||
self
|
||||
@ -259,21 +374,6 @@ where
|
||||
.expect("Invalid document addition state")
|
||||
.output_from_sorter(self.wtxn, &self.progress)?;
|
||||
|
||||
let new_facets = output.compute_real_facets(self.wtxn, self.index)?;
|
||||
self.index.put_faceted_fields(self.wtxn, &new_facets)?;
|
||||
|
||||
// in case new fields were introduced we're going to recreate the searchable fields.
|
||||
if let Some(faceted_fields) = self.index.user_defined_searchable_fields(self.wtxn)? {
|
||||
// we can't keep references on the faceted fields while we update the index thus we need to own it.
|
||||
let faceted_fields: Vec<String> =
|
||||
faceted_fields.into_iter().map(str::to_string).collect();
|
||||
self.index.put_all_searchable_fields_from_fields_ids_map(
|
||||
self.wtxn,
|
||||
&faceted_fields.iter().map(String::as_ref).collect::<Vec<_>>(),
|
||||
&output.fields_ids_map,
|
||||
)?;
|
||||
}
|
||||
|
||||
let indexed_documents = output.documents_count as u64;
|
||||
let number_of_documents = self.execute_raw(output)?;
|
||||
|
||||
@ -296,32 +396,35 @@ where
|
||||
|
||||
let TransformOutput {
|
||||
primary_key,
|
||||
fields_ids_map,
|
||||
mut settings_diff,
|
||||
field_distribution,
|
||||
documents_count,
|
||||
original_documents,
|
||||
flattened_documents,
|
||||
} = output;
|
||||
|
||||
// The fields_ids_map is put back to the store now so the rest of the transaction sees an
|
||||
// up to date field map.
|
||||
self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?;
|
||||
// update the internal facet and searchable list,
|
||||
// because they might have changed due to the nested documents flattening.
|
||||
settings_diff.new.recompute_facets(self.wtxn, self.index)?;
|
||||
settings_diff.new.recompute_searchables(self.wtxn, self.index)?;
|
||||
|
||||
let settings_diff = Arc::new(settings_diff);
|
||||
|
||||
let backup_pool;
|
||||
let pool = match self.indexer_config.thread_pool {
|
||||
Some(ref pool) => pool,
|
||||
#[cfg(not(test))]
|
||||
None => {
|
||||
// We initialize a bakcup pool with the default
|
||||
// We initialize a backup pool with the default
|
||||
// settings if none have already been set.
|
||||
backup_pool = rayon::ThreadPoolBuilder::new().build()?;
|
||||
&backup_pool
|
||||
}
|
||||
#[cfg(test)]
|
||||
None => {
|
||||
// We initialize a bakcup pool with the default
|
||||
// settings if none have already been set.
|
||||
backup_pool = rayon::ThreadPoolBuilder::new().num_threads(1).build()?;
|
||||
#[allow(unused_mut)]
|
||||
let mut pool_builder = ThreadPoolNoAbortBuilder::new();
|
||||
|
||||
#[cfg(test)]
|
||||
{
|
||||
pool_builder = pool_builder.num_threads(1);
|
||||
}
|
||||
|
||||
backup_pool = pool_builder.build()?;
|
||||
&backup_pool
|
||||
}
|
||||
};
|
||||
@ -333,13 +436,8 @@ where
|
||||
) = crossbeam_channel::unbounded();
|
||||
|
||||
// get the primary key field id
|
||||
let primary_key_id = fields_ids_map.id(&primary_key).unwrap();
|
||||
let primary_key_id = settings_diff.new.fields_ids_map.id(&primary_key).unwrap();
|
||||
|
||||
// get searchable fields for word databases
|
||||
let searchable_fields =
|
||||
self.index.searchable_fields_ids(self.wtxn)?.map(HashSet::from_iter);
|
||||
// get filterable fields for facet databases
|
||||
let faceted_fields = self.index.faceted_fields_ids(self.wtxn)?;
|
||||
// get the fid of the `_geo.lat` and `_geo.lng` fields.
|
||||
let mut field_id_map = self.index.fields_ids_map(self.wtxn)?;
|
||||
|
||||
@ -362,12 +460,6 @@ where
|
||||
None => None,
|
||||
};
|
||||
|
||||
let stop_words = self.index.stop_words(self.wtxn)?;
|
||||
let separators = self.index.allowed_separators(self.wtxn)?;
|
||||
let dictionary = self.index.dictionary(self.wtxn)?;
|
||||
let exact_attributes = self.index.exact_attributes_ids(self.wtxn)?;
|
||||
let proximity_precision = self.index.proximity_precision(self.wtxn)?.unwrap_or_default();
|
||||
|
||||
let pool_params = GrenadParameters {
|
||||
chunk_compression_type: self.indexer_config.chunk_compression_type,
|
||||
chunk_compression_level: self.indexer_config.chunk_compression_level,
|
||||
@ -400,8 +492,6 @@ where
|
||||
|
||||
let max_positions_per_attributes = self.indexer_config.max_positions_per_attributes;
|
||||
|
||||
let cloned_embedder = self.embedders.clone();
|
||||
|
||||
let mut final_documents_ids = RoaringBitmap::new();
|
||||
let mut databases_seen = 0;
|
||||
let mut word_position_docids = None;
|
||||
@ -410,7 +500,6 @@ where
|
||||
let mut exact_word_docids = None;
|
||||
let mut chunk_accumulator = ChunkAccumulator::default();
|
||||
let mut dimension = HashMap::new();
|
||||
let stop_words = stop_words.map(|sw| sw.map_data(Vec::from).unwrap());
|
||||
|
||||
let current_span = tracing::Span::current();
|
||||
|
||||
@ -428,10 +517,6 @@ where
|
||||
let flattened_chunk_iter =
|
||||
grenad_obkv_into_chunks(flattened_documents, pool_params, documents_chunk_size);
|
||||
|
||||
let separators: Option<Vec<_>> =
|
||||
separators.as_ref().map(|x| x.iter().map(String::as_str).collect());
|
||||
let dictionary: Option<Vec<_>> =
|
||||
dictionary.as_ref().map(|x| x.iter().map(String::as_str).collect());
|
||||
let result = original_chunk_iter.and_then(|original_chunk| {
|
||||
let flattened_chunk = flattened_chunk_iter?;
|
||||
// extract all databases from the chunked obkv documents
|
||||
@ -440,18 +525,10 @@ where
|
||||
flattened_chunk,
|
||||
pool_params,
|
||||
lmdb_writer_sx.clone(),
|
||||
searchable_fields,
|
||||
faceted_fields,
|
||||
primary_key_id,
|
||||
geo_fields_ids,
|
||||
field_id_map,
|
||||
stop_words,
|
||||
separators.as_deref(),
|
||||
dictionary.as_deref(),
|
||||
settings_diff.clone(),
|
||||
max_positions_per_attributes,
|
||||
exact_attributes,
|
||||
proximity_precision,
|
||||
cloned_embedder,
|
||||
)
|
||||
});
|
||||
|
||||
@ -571,7 +648,7 @@ where
|
||||
}
|
||||
|
||||
Ok(())
|
||||
})?;
|
||||
}).map_err(InternalError::from)??;
|
||||
|
||||
// We write the field distribution into the main database
|
||||
self.index.put_field_distribution(self.wtxn, &field_distribution)?;
|
||||
@ -600,7 +677,8 @@ where
|
||||
writer.build(wtxn, &mut rng, None)?;
|
||||
}
|
||||
Result::Ok(())
|
||||
})?;
|
||||
})
|
||||
.map_err(InternalError::from)??;
|
||||
}
|
||||
|
||||
self.execute_prefix_databases(
|
||||
|
@ -1,12 +1,11 @@
|
||||
use std::borrow::Cow;
|
||||
use std::collections::btree_map::Entry as BEntry;
|
||||
use std::collections::hash_map::Entry as HEntry;
|
||||
use std::collections::{HashMap, HashSet};
|
||||
use std::collections::HashMap;
|
||||
use std::fs::File;
|
||||
use std::io::{Read, Seek};
|
||||
|
||||
use fxhash::FxHashMap;
|
||||
use heed::RoTxn;
|
||||
use itertools::Itertools;
|
||||
use obkv::{KvReader, KvReaderU16, KvWriter};
|
||||
use roaring::RoaringBitmap;
|
||||
@ -21,14 +20,17 @@ use super::{IndexDocumentsMethod, IndexerConfig};
|
||||
use crate::documents::{DocumentsBatchIndex, EnrichedDocument, EnrichedDocumentsBatchReader};
|
||||
use crate::error::{Error, InternalError, UserError};
|
||||
use crate::index::{db_name, main_key};
|
||||
use crate::update::del_add::{into_del_add_obkv, DelAdd, DelAddOperation, KvReaderDelAdd};
|
||||
use crate::update::del_add::{
|
||||
del_add_from_two_obkvs, into_del_add_obkv, DelAdd, DelAddOperation, KvReaderDelAdd,
|
||||
};
|
||||
use crate::update::index_documents::GrenadParameters;
|
||||
use crate::update::{AvailableDocumentsIds, ClearDocuments, UpdateIndexingStep};
|
||||
use crate::update::settings::{InnerIndexSettings, InnerIndexSettingsDiff};
|
||||
use crate::update::{AvailableDocumentsIds, UpdateIndexingStep};
|
||||
use crate::{FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldsIdsMap, Index, Result};
|
||||
|
||||
pub struct TransformOutput {
|
||||
pub primary_key: String,
|
||||
pub fields_ids_map: FieldsIdsMap,
|
||||
pub settings_diff: InnerIndexSettingsDiff,
|
||||
pub field_distribution: FieldDistribution,
|
||||
pub documents_count: usize,
|
||||
pub original_documents: File,
|
||||
@ -282,7 +284,9 @@ impl<'a, 'i> Transform<'a, 'i> {
|
||||
self.original_sorter
|
||||
.insert(&document_sorter_key_buffer, &document_sorter_value_buffer)?;
|
||||
let base_obkv = KvReader::new(base_obkv);
|
||||
if let Some(flattened_obkv) = self.flatten_from_fields_ids_map(base_obkv)? {
|
||||
if let Some(flattened_obkv) =
|
||||
Self::flatten_from_fields_ids_map(&base_obkv, &mut self.fields_ids_map)?
|
||||
{
|
||||
// we recreate our buffer with the flattened documents
|
||||
document_sorter_value_buffer.clear();
|
||||
document_sorter_value_buffer.push(Operation::Addition as u8);
|
||||
@ -315,7 +319,9 @@ impl<'a, 'i> Transform<'a, 'i> {
|
||||
.insert(&document_sorter_key_buffer, &document_sorter_value_buffer)?;
|
||||
|
||||
let flattened_obkv = KvReader::new(&obkv_buffer);
|
||||
if let Some(obkv) = self.flatten_from_fields_ids_map(flattened_obkv)? {
|
||||
if let Some(obkv) =
|
||||
Self::flatten_from_fields_ids_map(&flattened_obkv, &mut self.fields_ids_map)?
|
||||
{
|
||||
document_sorter_value_buffer.clear();
|
||||
document_sorter_value_buffer.push(Operation::Addition as u8);
|
||||
into_del_add_obkv(
|
||||
@ -524,7 +530,9 @@ impl<'a, 'i> Transform<'a, 'i> {
|
||||
|
||||
// flatten it and push it as to delete in the flattened_sorter
|
||||
let flattened_obkv = KvReader::new(base_obkv);
|
||||
if let Some(obkv) = self.flatten_from_fields_ids_map(flattened_obkv)? {
|
||||
if let Some(obkv) =
|
||||
Self::flatten_from_fields_ids_map(&flattened_obkv, &mut self.fields_ids_map)?
|
||||
{
|
||||
// we recreate our buffer with the flattened documents
|
||||
document_sorter_value_buffer.clear();
|
||||
document_sorter_value_buffer.push(Operation::Deletion as u8);
|
||||
@ -541,8 +549,15 @@ impl<'a, 'i> Transform<'a, 'i> {
|
||||
|
||||
// Flatten a document from the fields ids map contained in self and insert the new
|
||||
// created fields. Returns `None` if the document doesn't need to be flattened.
|
||||
#[tracing::instrument(level = "trace", skip(self, obkv), target = "indexing::transform")]
|
||||
fn flatten_from_fields_ids_map(&mut self, obkv: KvReader<FieldId>) -> Result<Option<Vec<u8>>> {
|
||||
#[tracing::instrument(
|
||||
level = "trace",
|
||||
skip(obkv, fields_ids_map),
|
||||
target = "indexing::transform"
|
||||
)]
|
||||
fn flatten_from_fields_ids_map(
|
||||
obkv: &KvReader<FieldId>,
|
||||
fields_ids_map: &mut FieldsIdsMap,
|
||||
) -> Result<Option<Vec<u8>>> {
|
||||
if obkv
|
||||
.iter()
|
||||
.all(|(_, value)| !json_depth_checker::should_flatten_from_unchecked_slice(value))
|
||||
@ -563,7 +578,7 @@ impl<'a, 'i> Transform<'a, 'i> {
|
||||
// all the raw values get inserted directly in the `key_value` vec.
|
||||
for (key, value) in obkv.iter() {
|
||||
if json_depth_checker::should_flatten_from_unchecked_slice(value) {
|
||||
let key = self.fields_ids_map.name(key).ok_or(FieldIdMapMissingEntry::FieldId {
|
||||
let key = fields_ids_map.name(key).ok_or(FieldIdMapMissingEntry::FieldId {
|
||||
field_id: key,
|
||||
process: "Flatten from fields ids map.",
|
||||
})?;
|
||||
@ -581,7 +596,7 @@ impl<'a, 'i> Transform<'a, 'i> {
|
||||
// Once we have the flattened version we insert all the new generated fields_ids
|
||||
// (if any) in the fields ids map and serialize the value.
|
||||
for (key, value) in flattened.into_iter() {
|
||||
let fid = self.fields_ids_map.insert(&key).ok_or(UserError::AttributeLimitReached)?;
|
||||
let fid = fields_ids_map.insert(&key).ok_or(UserError::AttributeLimitReached)?;
|
||||
let value = serde_json::to_vec(&value).map_err(InternalError::SerdeJson)?;
|
||||
key_value.push((fid, value.into()));
|
||||
}
|
||||
@ -792,9 +807,19 @@ impl<'a, 'i> Transform<'a, 'i> {
|
||||
fst_new_external_documents_ids_builder.insert(key, value)
|
||||
})?;
|
||||
|
||||
let old_inner_settings = InnerIndexSettings::from_index(self.index, wtxn)?;
|
||||
let mut new_inner_settings = old_inner_settings.clone();
|
||||
new_inner_settings.fields_ids_map = self.fields_ids_map;
|
||||
let settings_diff = InnerIndexSettingsDiff {
|
||||
old: old_inner_settings,
|
||||
new: new_inner_settings,
|
||||
embedding_configs_updated: false,
|
||||
settings_update_only: false,
|
||||
};
|
||||
|
||||
Ok(TransformOutput {
|
||||
primary_key,
|
||||
fields_ids_map: self.fields_ids_map,
|
||||
settings_diff,
|
||||
field_distribution,
|
||||
documents_count: self.documents_count,
|
||||
original_documents: original_documents.into_inner().map_err(|err| err.into_error())?,
|
||||
@ -804,6 +829,44 @@ impl<'a, 'i> Transform<'a, 'i> {
|
||||
})
|
||||
}
|
||||
|
||||
/// Rebind the field_ids of the provided document to their values
|
||||
/// based on the field_ids_maps difference between the old and the new settings,
|
||||
/// then fill the provided buffers with delta documents using KvWritterDelAdd.
|
||||
fn rebind_existing_document(
|
||||
old_obkv: KvReader<FieldId>,
|
||||
settings_diff: &InnerIndexSettingsDiff,
|
||||
original_obkv_buffer: &mut Vec<u8>,
|
||||
flattened_obkv_buffer: &mut Vec<u8>,
|
||||
) -> Result<()> {
|
||||
let mut old_fields_ids_map = settings_diff.old.fields_ids_map.clone();
|
||||
let mut new_fields_ids_map = settings_diff.new.fields_ids_map.clone();
|
||||
let mut obkv_writer = KvWriter::<_, FieldId>::memory();
|
||||
// We iterate over the new `FieldsIdsMap` ids in order and construct the new obkv.
|
||||
for (id, name) in new_fields_ids_map.iter() {
|
||||
if let Some(val) = old_fields_ids_map.id(name).and_then(|id| old_obkv.get(id)) {
|
||||
obkv_writer.insert(id, val)?;
|
||||
}
|
||||
}
|
||||
let data = obkv_writer.into_inner()?;
|
||||
let new_obkv = KvReader::<FieldId>::new(&data);
|
||||
|
||||
// take the non-flattened version if flatten_from_fields_ids_map returns None.
|
||||
let old_flattened = Self::flatten_from_fields_ids_map(&old_obkv, &mut old_fields_ids_map)?;
|
||||
let old_flattened =
|
||||
old_flattened.as_deref().map_or_else(|| old_obkv, KvReader::<FieldId>::new);
|
||||
let new_flattened = Self::flatten_from_fields_ids_map(&new_obkv, &mut new_fields_ids_map)?;
|
||||
let new_flattened =
|
||||
new_flattened.as_deref().map_or_else(|| new_obkv, KvReader::<FieldId>::new);
|
||||
|
||||
original_obkv_buffer.clear();
|
||||
flattened_obkv_buffer.clear();
|
||||
|
||||
del_add_from_two_obkvs(&old_obkv, &new_obkv, original_obkv_buffer)?;
|
||||
del_add_from_two_obkvs(&old_flattened, &new_flattened, flattened_obkv_buffer)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Clear all databases. Returns a `TransformOutput` with a file that contains the documents
|
||||
/// of the index with the attributes reordered accordingly to the `FieldsIdsMap` given as argument.
|
||||
///
|
||||
@ -811,8 +874,7 @@ impl<'a, 'i> Transform<'a, 'i> {
|
||||
pub fn prepare_for_documents_reindexing(
|
||||
self,
|
||||
wtxn: &mut heed::RwTxn<'i>,
|
||||
old_fields_ids_map: FieldsIdsMap,
|
||||
mut new_fields_ids_map: FieldsIdsMap,
|
||||
settings_diff: InnerIndexSettingsDiff,
|
||||
) -> Result<TransformOutput> {
|
||||
// There already has been a document addition, the primary key should be set by now.
|
||||
let primary_key = self
|
||||
@ -848,78 +910,27 @@ impl<'a, 'i> Transform<'a, 'i> {
|
||||
self.indexer_settings.max_memory.map(|mem| mem / 2),
|
||||
);
|
||||
|
||||
let mut obkv_buffer = Vec::new();
|
||||
let mut original_obkv_buffer = Vec::new();
|
||||
let mut flattened_obkv_buffer = Vec::new();
|
||||
let mut document_sorter_key_buffer = Vec::new();
|
||||
let mut document_sorter_value_buffer = Vec::new();
|
||||
for result in self.index.external_documents_ids().iter(wtxn)? {
|
||||
let (external_id, docid) = result?;
|
||||
let obkv = self.index.documents.get(wtxn, &docid)?.ok_or(
|
||||
let old_obkv = self.index.documents.get(wtxn, &docid)?.ok_or(
|
||||
InternalError::DatabaseMissingEntry { db_name: db_name::DOCUMENTS, key: None },
|
||||
)?;
|
||||
|
||||
obkv_buffer.clear();
|
||||
let mut obkv_writer = KvWriter::<_, FieldId>::new(&mut obkv_buffer);
|
||||
|
||||
// We iterate over the new `FieldsIdsMap` ids in order and construct the new obkv.
|
||||
for (id, name) in new_fields_ids_map.iter() {
|
||||
if let Some(val) = old_fields_ids_map.id(name).and_then(|id| obkv.get(id)) {
|
||||
obkv_writer.insert(id, val)?;
|
||||
}
|
||||
}
|
||||
|
||||
let buffer = obkv_writer.into_inner()?;
|
||||
Self::rebind_existing_document(
|
||||
old_obkv,
|
||||
&settings_diff,
|
||||
&mut original_obkv_buffer,
|
||||
&mut flattened_obkv_buffer,
|
||||
)?;
|
||||
|
||||
document_sorter_key_buffer.clear();
|
||||
document_sorter_key_buffer.extend_from_slice(&docid.to_be_bytes());
|
||||
document_sorter_key_buffer.extend_from_slice(external_id.as_bytes());
|
||||
document_sorter_value_buffer.clear();
|
||||
into_del_add_obkv(
|
||||
KvReaderU16::new(buffer),
|
||||
DelAddOperation::Addition,
|
||||
&mut document_sorter_value_buffer,
|
||||
)?;
|
||||
original_sorter.insert(&document_sorter_key_buffer, &document_sorter_value_buffer)?;
|
||||
|
||||
// Once we have the document. We're going to flatten it
|
||||
// and insert it in the flattened sorter.
|
||||
let mut doc = serde_json::Map::new();
|
||||
|
||||
let reader = obkv::KvReader::new(buffer);
|
||||
for (k, v) in reader.iter() {
|
||||
let key = new_fields_ids_map.name(k).ok_or(FieldIdMapMissingEntry::FieldId {
|
||||
field_id: k,
|
||||
process: "Accessing field distribution in transform.",
|
||||
})?;
|
||||
let value = serde_json::from_slice::<serde_json::Value>(v)
|
||||
.map_err(InternalError::SerdeJson)?;
|
||||
doc.insert(key.to_string(), value);
|
||||
}
|
||||
|
||||
let flattened = flatten_serde_json::flatten(&doc);
|
||||
|
||||
// Once we have the flattened version we can convert it back to obkv and
|
||||
// insert all the new generated fields_ids (if any) in the fields ids map.
|
||||
let mut buffer: Vec<u8> = Vec::new();
|
||||
let mut writer = KvWriter::new(&mut buffer);
|
||||
let mut flattened: Vec<_> = flattened.into_iter().collect();
|
||||
// we reorder the field to get all the known field first
|
||||
flattened.sort_unstable_by_key(|(key, _)| {
|
||||
new_fields_ids_map.id(key).unwrap_or(FieldId::MAX)
|
||||
});
|
||||
|
||||
for (key, value) in flattened {
|
||||
let fid =
|
||||
new_fields_ids_map.insert(&key).ok_or(UserError::AttributeLimitReached)?;
|
||||
let value = serde_json::to_vec(&value).map_err(InternalError::SerdeJson)?;
|
||||
writer.insert(fid, &value)?;
|
||||
}
|
||||
document_sorter_value_buffer.clear();
|
||||
into_del_add_obkv(
|
||||
KvReaderU16::new(&buffer),
|
||||
DelAddOperation::Addition,
|
||||
&mut document_sorter_value_buffer,
|
||||
)?;
|
||||
flattened_sorter.insert(docid.to_be_bytes(), &document_sorter_value_buffer)?;
|
||||
original_sorter.insert(&document_sorter_key_buffer, &original_obkv_buffer)?;
|
||||
flattened_sorter.insert(docid.to_be_bytes(), &flattened_obkv_buffer)?;
|
||||
}
|
||||
|
||||
let grenad_params = GrenadParameters {
|
||||
@ -934,22 +945,14 @@ impl<'a, 'i> Transform<'a, 'i> {
|
||||
|
||||
let flattened_documents = sorter_into_reader(flattened_sorter, grenad_params)?;
|
||||
|
||||
let output = TransformOutput {
|
||||
Ok(TransformOutput {
|
||||
primary_key,
|
||||
fields_ids_map: new_fields_ids_map,
|
||||
field_distribution,
|
||||
settings_diff,
|
||||
documents_count,
|
||||
original_documents: original_documents.into_inner().into_inner(),
|
||||
flattened_documents: flattened_documents.into_inner().into_inner(),
|
||||
};
|
||||
|
||||
let new_facets = output.compute_real_facets(wtxn, self.index)?;
|
||||
self.index.put_faceted_fields(wtxn, &new_facets)?;
|
||||
|
||||
// We clear the full database (words-fst, documents ids and documents content).
|
||||
ClearDocuments::new(wtxn, self.index).execute()?;
|
||||
|
||||
Ok(output)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
@ -964,20 +967,6 @@ fn drop_and_reuse<U, T>(mut vec: Vec<U>) -> Vec<T> {
|
||||
vec.into_iter().map(|_| unreachable!()).collect()
|
||||
}
|
||||
|
||||
impl TransformOutput {
|
||||
// find and insert the new field ids
|
||||
pub fn compute_real_facets(&self, rtxn: &RoTxn, index: &Index) -> Result<HashSet<String>> {
|
||||
let user_defined_facets = index.user_defined_faceted_fields(rtxn)?;
|
||||
|
||||
Ok(self
|
||||
.fields_ids_map
|
||||
.names()
|
||||
.filter(|&field| crate::is_faceted(field, &user_defined_facets))
|
||||
.map(|field| field.to_string())
|
||||
.collect())
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use super::*;
|
||||
|
@ -1,5 +1,6 @@
|
||||
use grenad::CompressionType;
|
||||
use rayon::ThreadPool;
|
||||
|
||||
use crate::thread_pool_no_abort::ThreadPoolNoAbort;
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct IndexerConfig {
|
||||
@ -9,7 +10,7 @@ pub struct IndexerConfig {
|
||||
pub max_memory: Option<usize>,
|
||||
pub chunk_compression_type: CompressionType,
|
||||
pub chunk_compression_level: Option<u32>,
|
||||
pub thread_pool: Option<ThreadPool>,
|
||||
pub thread_pool: Option<ThreadPoolNoAbort>,
|
||||
pub max_positions_per_attributes: Option<u32>,
|
||||
pub skip_index_budget: bool,
|
||||
}
|
||||
|
@ -20,7 +20,7 @@ use crate::update::index_documents::IndexDocumentsMethod;
|
||||
use crate::update::{IndexDocuments, UpdateIndexingStep};
|
||||
use crate::vector::settings::{check_set, check_unset, EmbedderSource, EmbeddingSettings};
|
||||
use crate::vector::{Embedder, EmbeddingConfig, EmbeddingConfigs};
|
||||
use crate::{FieldsIdsMap, Index, Result};
|
||||
use crate::{FieldId, FieldsIdsMap, Index, Result};
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq, Copy)]
|
||||
pub enum Setting<T> {
|
||||
@ -385,14 +385,14 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
|
||||
|
||||
#[tracing::instrument(
|
||||
level = "trace"
|
||||
skip(self, progress_callback, should_abort, old_fields_ids_map),
|
||||
skip(self, progress_callback, should_abort, settings_diff),
|
||||
target = "indexing::documents"
|
||||
)]
|
||||
fn reindex<FP, FA>(
|
||||
&mut self,
|
||||
progress_callback: &FP,
|
||||
should_abort: &FA,
|
||||
old_fields_ids_map: FieldsIdsMap,
|
||||
settings_diff: InnerIndexSettingsDiff,
|
||||
) -> Result<()>
|
||||
where
|
||||
FP: Fn(UpdateIndexingStep) + Sync,
|
||||
@ -400,7 +400,6 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
|
||||
{
|
||||
puffin::profile_function!();
|
||||
|
||||
let fields_ids_map = self.index.fields_ids_map(self.wtxn)?;
|
||||
// if the settings are set before any document update, we don't need to do anything, and
|
||||
// will set the primary key during the first document addition.
|
||||
if self.index.number_of_documents(self.wtxn)? == 0 {
|
||||
@ -416,14 +415,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
|
||||
)?;
|
||||
|
||||
// We clear the databases and remap the documents fields based on the new `FieldsIdsMap`.
|
||||
let output = transform.prepare_for_documents_reindexing(
|
||||
self.wtxn,
|
||||
old_fields_ids_map,
|
||||
fields_ids_map,
|
||||
)?;
|
||||
|
||||
let embedder_configs = self.index.embedding_configs(self.wtxn)?;
|
||||
let embedders = self.embedders(embedder_configs)?;
|
||||
let output = transform.prepare_for_documents_reindexing(self.wtxn, settings_diff)?;
|
||||
|
||||
// We index the generated `TransformOutput` which must contain
|
||||
// all the documents with fields in the newly defined searchable order.
|
||||
@ -436,32 +428,11 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
|
||||
&should_abort,
|
||||
)?;
|
||||
|
||||
let indexing_builder = indexing_builder.with_embedders(embedders);
|
||||
indexing_builder.execute_raw(output)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn embedders(
|
||||
&self,
|
||||
embedding_configs: Vec<(String, EmbeddingConfig)>,
|
||||
) -> Result<EmbeddingConfigs> {
|
||||
let res: Result<_> = embedding_configs
|
||||
.into_iter()
|
||||
.map(|(name, EmbeddingConfig { embedder_options, prompt })| {
|
||||
let prompt = Arc::new(prompt.try_into().map_err(crate::Error::from)?);
|
||||
|
||||
let embedder = Arc::new(
|
||||
Embedder::new(embedder_options.clone())
|
||||
.map_err(crate::vector::Error::from)
|
||||
.map_err(crate::Error::from)?,
|
||||
);
|
||||
Ok((name, (embedder, prompt)))
|
||||
})
|
||||
.collect();
|
||||
res.map(EmbeddingConfigs::new)
|
||||
}
|
||||
|
||||
fn update_displayed(&mut self) -> Result<bool> {
|
||||
match self.displayed_fields {
|
||||
Setting::Set(ref fields) => {
|
||||
@ -1038,6 +1009,13 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
|
||||
}
|
||||
Setting::NotSet => false,
|
||||
};
|
||||
|
||||
// if any changes force a reindexing
|
||||
// clear the vector database.
|
||||
if update {
|
||||
self.index.vector_arroy.clear(self.wtxn)?;
|
||||
}
|
||||
|
||||
Ok(update)
|
||||
}
|
||||
|
||||
@ -1066,20 +1044,10 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
|
||||
{
|
||||
self.index.set_updated_at(self.wtxn, &OffsetDateTime::now_utc())?;
|
||||
|
||||
// Note: this MUST be before `update_sortable` so that we can get the old value to compare with the updated value afterwards
|
||||
|
||||
let existing_fields: HashSet<_> = self
|
||||
.index
|
||||
.field_distribution(self.wtxn)?
|
||||
.into_iter()
|
||||
.filter_map(|(field, count)| (count != 0).then_some(field))
|
||||
.collect();
|
||||
let old_faceted_fields = self.index.user_defined_faceted_fields(self.wtxn)?;
|
||||
let old_fields_ids_map = self.index.fields_ids_map(self.wtxn)?;
|
||||
let old_inner_settings = InnerIndexSettings::from_index(self.index, self.wtxn)?;
|
||||
|
||||
// never trigger re-indexing
|
||||
self.update_displayed()?;
|
||||
self.update_filterable()?;
|
||||
self.update_sortable()?;
|
||||
self.update_distinct_field()?;
|
||||
self.update_criteria()?;
|
||||
self.update_primary_key()?;
|
||||
@ -1089,16 +1057,19 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
|
||||
self.update_max_values_per_facet()?;
|
||||
self.update_sort_facet_values_by()?;
|
||||
self.update_pagination_max_total_hits()?;
|
||||
self.update_search_cutoff()?;
|
||||
|
||||
let faceted_updated = self.update_faceted(existing_fields, old_faceted_fields)?;
|
||||
let stop_words_updated = self.update_stop_words()?;
|
||||
let non_separator_tokens_updated = self.update_non_separator_tokens()?;
|
||||
let separator_tokens_updated = self.update_separator_tokens()?;
|
||||
let dictionary_updated = self.update_dictionary()?;
|
||||
let synonyms_updated = self.update_synonyms()?;
|
||||
let searchable_updated = self.update_searchable()?;
|
||||
let exact_attributes_updated = self.update_exact_attributes()?;
|
||||
let proximity_precision = self.update_proximity_precision()?;
|
||||
// could trigger re-indexing
|
||||
self.update_filterable()?;
|
||||
self.update_sortable()?;
|
||||
self.update_stop_words()?;
|
||||
self.update_non_separator_tokens()?;
|
||||
self.update_separator_tokens()?;
|
||||
self.update_dictionary()?;
|
||||
self.update_synonyms()?;
|
||||
self.update_searchable()?;
|
||||
self.update_exact_attributes()?;
|
||||
self.update_proximity_precision()?;
|
||||
// TODO: very rough approximation of the needs for reindexing where any change will result in
|
||||
// a full reindexing.
|
||||
// What can be done instead:
|
||||
@ -1107,53 +1078,193 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
|
||||
// 3. Keep the old vectors but reattempt indexing on a prompt change: only actually changed prompt will need embedding + storage
|
||||
let embedding_configs_updated = self.update_embedding_configs()?;
|
||||
|
||||
// never trigger re-indexing
|
||||
self.update_search_cutoff()?;
|
||||
let new_inner_settings = InnerIndexSettings::from_index(self.index, self.wtxn)?;
|
||||
let inner_settings_diff = InnerIndexSettingsDiff {
|
||||
old: old_inner_settings,
|
||||
new: new_inner_settings,
|
||||
embedding_configs_updated,
|
||||
settings_update_only: true,
|
||||
};
|
||||
|
||||
if stop_words_updated
|
||||
|| non_separator_tokens_updated
|
||||
|| separator_tokens_updated
|
||||
|| dictionary_updated
|
||||
|| faceted_updated
|
||||
|| synonyms_updated
|
||||
|| searchable_updated
|
||||
|| exact_attributes_updated
|
||||
|| proximity_precision
|
||||
|| embedding_configs_updated
|
||||
{
|
||||
self.reindex(&progress_callback, &should_abort, old_fields_ids_map)?;
|
||||
if inner_settings_diff.any_reindexing_needed() {
|
||||
self.reindex(&progress_callback, &should_abort, inner_settings_diff)?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
fn update_faceted(
|
||||
&self,
|
||||
existing_fields: HashSet<String>,
|
||||
old_faceted_fields: HashSet<String>,
|
||||
) -> Result<bool> {
|
||||
pub struct InnerIndexSettingsDiff {
|
||||
pub(crate) old: InnerIndexSettings,
|
||||
pub(crate) new: InnerIndexSettings,
|
||||
|
||||
// TODO: compare directly the embedders.
|
||||
pub(crate) embedding_configs_updated: bool,
|
||||
|
||||
pub(crate) settings_update_only: bool,
|
||||
}
|
||||
|
||||
impl InnerIndexSettingsDiff {
|
||||
pub fn any_reindexing_needed(&self) -> bool {
|
||||
self.reindex_searchable() || self.reindex_facets() || self.reindex_vectors()
|
||||
}
|
||||
|
||||
pub fn reindex_searchable(&self) -> bool {
|
||||
self.old
|
||||
.fields_ids_map
|
||||
.iter()
|
||||
.zip(self.new.fields_ids_map.iter())
|
||||
.any(|(old, new)| old != new)
|
||||
|| self.old.stop_words.as_ref().map(|set| set.as_fst().as_bytes())
|
||||
!= self.new.stop_words.as_ref().map(|set| set.as_fst().as_bytes())
|
||||
|| self.old.allowed_separators != self.new.allowed_separators
|
||||
|| self.old.dictionary != self.new.dictionary
|
||||
|| self.old.user_defined_searchable_fields != self.new.user_defined_searchable_fields
|
||||
|| self.old.exact_attributes != self.new.exact_attributes
|
||||
|| self.old.proximity_precision != self.new.proximity_precision
|
||||
}
|
||||
|
||||
pub fn reindex_facets(&self) -> bool {
|
||||
let existing_fields = &self.new.existing_fields;
|
||||
if existing_fields.iter().any(|field| field.contains('.')) {
|
||||
return Ok(true);
|
||||
return true;
|
||||
}
|
||||
|
||||
let old_faceted_fields = &self.old.user_defined_faceted_fields;
|
||||
if old_faceted_fields.iter().any(|field| field.contains('.')) {
|
||||
return Ok(true);
|
||||
return true;
|
||||
}
|
||||
|
||||
// If there are new faceted fields we indicate that we must reindex as we must
|
||||
// index new fields as facets. It means that the distinct attribute,
|
||||
// an Asc/Desc criterion or a filtered attribute has been added or removed.
|
||||
let new_faceted_fields = self.index.user_defined_faceted_fields(self.wtxn)?;
|
||||
|
||||
let new_faceted_fields = &self.new.user_defined_faceted_fields;
|
||||
if new_faceted_fields.iter().any(|field| field.contains('.')) {
|
||||
return Ok(true);
|
||||
return true;
|
||||
}
|
||||
|
||||
let faceted_updated =
|
||||
(&existing_fields - &old_faceted_fields) != (&existing_fields - &new_faceted_fields);
|
||||
(existing_fields - old_faceted_fields) != (existing_fields - new_faceted_fields);
|
||||
|
||||
Ok(faceted_updated)
|
||||
self.old
|
||||
.fields_ids_map
|
||||
.iter()
|
||||
.zip(self.new.fields_ids_map.iter())
|
||||
.any(|(old, new)| old != new)
|
||||
|| faceted_updated
|
||||
}
|
||||
|
||||
pub fn reindex_vectors(&self) -> bool {
|
||||
self.embedding_configs_updated
|
||||
}
|
||||
|
||||
pub fn settings_update_only(&self) -> bool {
|
||||
self.settings_update_only
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
pub(crate) struct InnerIndexSettings {
|
||||
pub stop_words: Option<fst::Set<Vec<u8>>>,
|
||||
pub allowed_separators: Option<BTreeSet<String>>,
|
||||
pub dictionary: Option<BTreeSet<String>>,
|
||||
pub fields_ids_map: FieldsIdsMap,
|
||||
pub user_defined_faceted_fields: HashSet<String>,
|
||||
pub user_defined_searchable_fields: Option<Vec<String>>,
|
||||
pub faceted_fields_ids: HashSet<FieldId>,
|
||||
pub searchable_fields_ids: Option<Vec<FieldId>>,
|
||||
pub exact_attributes: HashSet<FieldId>,
|
||||
pub proximity_precision: ProximityPrecision,
|
||||
pub embedding_configs: EmbeddingConfigs,
|
||||
pub existing_fields: HashSet<String>,
|
||||
}
|
||||
|
||||
impl InnerIndexSettings {
|
||||
pub fn from_index(index: &Index, rtxn: &heed::RoTxn) -> Result<Self> {
|
||||
let stop_words = index.stop_words(rtxn)?;
|
||||
let stop_words = stop_words.map(|sw| sw.map_data(Vec::from).unwrap());
|
||||
let allowed_separators = index.allowed_separators(rtxn)?;
|
||||
let dictionary = index.dictionary(rtxn)?;
|
||||
let fields_ids_map = index.fields_ids_map(rtxn)?;
|
||||
let user_defined_searchable_fields = index.user_defined_searchable_fields(rtxn)?;
|
||||
let user_defined_searchable_fields =
|
||||
user_defined_searchable_fields.map(|sf| sf.into_iter().map(String::from).collect());
|
||||
let user_defined_faceted_fields = index.user_defined_faceted_fields(rtxn)?;
|
||||
let searchable_fields_ids = index.searchable_fields_ids(rtxn)?;
|
||||
let faceted_fields_ids = index.faceted_fields_ids(rtxn)?;
|
||||
let exact_attributes = index.exact_attributes_ids(rtxn)?;
|
||||
let proximity_precision = index.proximity_precision(rtxn)?.unwrap_or_default();
|
||||
let embedding_configs = embedders(index.embedding_configs(rtxn)?)?;
|
||||
let existing_fields: HashSet<_> = index
|
||||
.field_distribution(rtxn)?
|
||||
.into_iter()
|
||||
.filter_map(|(field, count)| (count != 0).then_some(field))
|
||||
.collect();
|
||||
|
||||
Ok(Self {
|
||||
stop_words,
|
||||
allowed_separators,
|
||||
dictionary,
|
||||
fields_ids_map,
|
||||
user_defined_faceted_fields,
|
||||
user_defined_searchable_fields,
|
||||
faceted_fields_ids,
|
||||
searchable_fields_ids,
|
||||
exact_attributes,
|
||||
proximity_precision,
|
||||
embedding_configs,
|
||||
existing_fields,
|
||||
})
|
||||
}
|
||||
|
||||
// find and insert the new field ids
|
||||
pub fn recompute_facets(&mut self, wtxn: &mut heed::RwTxn, index: &Index) -> Result<()> {
|
||||
let new_facets = self
|
||||
.fields_ids_map
|
||||
.names()
|
||||
.filter(|&field| crate::is_faceted(field, &self.user_defined_faceted_fields))
|
||||
.map(|field| field.to_string())
|
||||
.collect();
|
||||
index.put_faceted_fields(wtxn, &new_facets)?;
|
||||
|
||||
self.faceted_fields_ids = index.faceted_fields_ids(wtxn)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// find and insert the new field ids
|
||||
pub fn recompute_searchables(&mut self, wtxn: &mut heed::RwTxn, index: &Index) -> Result<()> {
|
||||
// in case new fields were introduced we're going to recreate the searchable fields.
|
||||
if let Some(searchable_fields) = self.user_defined_searchable_fields.as_ref() {
|
||||
let searchable_fields =
|
||||
searchable_fields.iter().map(String::as_ref).collect::<Vec<_>>();
|
||||
index.put_all_searchable_fields_from_fields_ids_map(
|
||||
wtxn,
|
||||
&searchable_fields,
|
||||
&self.fields_ids_map,
|
||||
)?;
|
||||
let searchable_fields_ids = index.searchable_fields_ids(wtxn)?;
|
||||
self.searchable_fields_ids = searchable_fields_ids;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
fn embedders(embedding_configs: Vec<(String, EmbeddingConfig)>) -> Result<EmbeddingConfigs> {
|
||||
let res: Result<_> = embedding_configs
|
||||
.into_iter()
|
||||
.map(|(name, EmbeddingConfig { embedder_options, prompt })| {
|
||||
let prompt = Arc::new(prompt.try_into().map_err(crate::Error::from)?);
|
||||
|
||||
let embedder = Arc::new(
|
||||
Embedder::new(embedder_options.clone())
|
||||
.map_err(crate::vector::Error::from)
|
||||
.map_err(crate::Error::from)?,
|
||||
);
|
||||
Ok((name, (embedder, prompt)))
|
||||
})
|
||||
.collect();
|
||||
res.map(EmbeddingConfigs::new)
|
||||
}
|
||||
|
||||
fn validate_prompt(
|
||||
@ -1643,6 +1754,70 @@ mod tests {
|
||||
.unwrap()
|
||||
.count();
|
||||
assert_eq!(count, 4);
|
||||
|
||||
// Set the filterable fields to be the age and the name.
|
||||
index
|
||||
.update_settings(|settings| {
|
||||
settings.set_filterable_fields(hashset! { S("age"), S("name") });
|
||||
})
|
||||
.unwrap();
|
||||
|
||||
// Check that the filterable fields are correctly set.
|
||||
let rtxn = index.read_txn().unwrap();
|
||||
let fields_ids = index.filterable_fields(&rtxn).unwrap();
|
||||
assert_eq!(fields_ids, hashset! { S("age"), S("name") });
|
||||
|
||||
let rtxn = index.read_txn().unwrap();
|
||||
// Only count the field_id 0 and level 0 facet values.
|
||||
let count = index
|
||||
.facet_id_f64_docids
|
||||
.remap_key_type::<Bytes>()
|
||||
.prefix_iter(&rtxn, &[0, 1, 0])
|
||||
.unwrap()
|
||||
.count();
|
||||
assert_eq!(count, 4);
|
||||
|
||||
let rtxn = index.read_txn().unwrap();
|
||||
// Only count the field_id 0 and level 0 facet values.
|
||||
let count = index
|
||||
.facet_id_string_docids
|
||||
.remap_key_type::<Bytes>()
|
||||
.prefix_iter(&rtxn, &[0, 0])
|
||||
.unwrap()
|
||||
.count();
|
||||
assert_eq!(count, 5);
|
||||
|
||||
// Remove the age from the filterable fields.
|
||||
index
|
||||
.update_settings(|settings| {
|
||||
settings.set_filterable_fields(hashset! { S("name") });
|
||||
})
|
||||
.unwrap();
|
||||
|
||||
// Check that the filterable fields are correctly set.
|
||||
let rtxn = index.read_txn().unwrap();
|
||||
let fields_ids = index.filterable_fields(&rtxn).unwrap();
|
||||
assert_eq!(fields_ids, hashset! { S("name") });
|
||||
|
||||
let rtxn = index.read_txn().unwrap();
|
||||
// Only count the field_id 0 and level 0 facet values.
|
||||
let count = index
|
||||
.facet_id_f64_docids
|
||||
.remap_key_type::<Bytes>()
|
||||
.prefix_iter(&rtxn, &[0, 1, 0])
|
||||
.unwrap()
|
||||
.count();
|
||||
assert_eq!(count, 0);
|
||||
|
||||
let rtxn = index.read_txn().unwrap();
|
||||
// Only count the field_id 0 and level 0 facet values.
|
||||
let count = index
|
||||
.facet_id_string_docids
|
||||
.remap_key_type::<Bytes>()
|
||||
.prefix_iter(&rtxn, &[0, 0])
|
||||
.unwrap()
|
||||
.count();
|
||||
assert_eq!(count, 5);
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
@ -3,6 +3,7 @@ use std::path::PathBuf;
|
||||
use hf_hub::api::sync::ApiError;
|
||||
|
||||
use crate::error::FaultSource;
|
||||
use crate::PanicCatched;
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
#[error("Error while generating embeddings: {inner}")]
|
||||
@ -80,6 +81,8 @@ pub enum EmbedErrorKind {
|
||||
OpenAiUnexpectedDimension(usize, usize),
|
||||
#[error("no embedding was produced")]
|
||||
MissingEmbedding,
|
||||
#[error(transparent)]
|
||||
PanicInThreadPool(#[from] PanicCatched),
|
||||
}
|
||||
|
||||
impl EmbedError {
|
||||
|
@ -7,6 +7,7 @@ use serde::{Deserialize, Serialize};
|
||||
|
||||
use self::error::{EmbedError, NewEmbedderError};
|
||||
use crate::prompt::{Prompt, PromptData};
|
||||
use crate::ThreadPoolNoAbort;
|
||||
|
||||
pub mod error;
|
||||
pub mod hf;
|
||||
@ -254,7 +255,7 @@ impl Embedder {
|
||||
pub fn embed_chunks(
|
||||
&self,
|
||||
text_chunks: Vec<Vec<String>>,
|
||||
threads: &rayon::ThreadPool,
|
||||
threads: &ThreadPoolNoAbort,
|
||||
) -> std::result::Result<Vec<Vec<Embeddings<f32>>>, EmbedError> {
|
||||
match self {
|
||||
Embedder::HuggingFace(embedder) => embedder.embed_chunks(text_chunks),
|
||||
|
@ -3,6 +3,8 @@ use rayon::iter::{IntoParallelIterator as _, ParallelIterator as _};
|
||||
use super::error::{EmbedError, EmbedErrorKind, NewEmbedderError, NewEmbedderErrorKind};
|
||||
use super::rest::{Embedder as RestEmbedder, EmbedderOptions as RestEmbedderOptions};
|
||||
use super::{DistributionShift, Embeddings};
|
||||
use crate::error::FaultSource;
|
||||
use crate::ThreadPoolNoAbort;
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct Embedder {
|
||||
@ -71,11 +73,16 @@ impl Embedder {
|
||||
pub fn embed_chunks(
|
||||
&self,
|
||||
text_chunks: Vec<Vec<String>>,
|
||||
threads: &rayon::ThreadPool,
|
||||
threads: &ThreadPoolNoAbort,
|
||||
) -> Result<Vec<Vec<Embeddings<f32>>>, EmbedError> {
|
||||
threads.install(move || {
|
||||
text_chunks.into_par_iter().map(move |chunk| self.embed(chunk)).collect()
|
||||
})
|
||||
threads
|
||||
.install(move || {
|
||||
text_chunks.into_par_iter().map(move |chunk| self.embed(chunk)).collect()
|
||||
})
|
||||
.map_err(|error| EmbedError {
|
||||
kind: EmbedErrorKind::PanicInThreadPool(error),
|
||||
fault: FaultSource::Bug,
|
||||
})?
|
||||
}
|
||||
|
||||
pub fn chunk_count_hint(&self) -> usize {
|
||||
|
@ -4,7 +4,9 @@ use rayon::iter::{IntoParallelIterator, ParallelIterator as _};
|
||||
use super::error::{EmbedError, NewEmbedderError};
|
||||
use super::rest::{Embedder as RestEmbedder, EmbedderOptions as RestEmbedderOptions};
|
||||
use super::{DistributionShift, Embeddings};
|
||||
use crate::error::FaultSource;
|
||||
use crate::vector::error::EmbedErrorKind;
|
||||
use crate::ThreadPoolNoAbort;
|
||||
|
||||
#[derive(Debug, Clone, Hash, PartialEq, Eq, serde::Deserialize, serde::Serialize)]
|
||||
pub struct EmbedderOptions {
|
||||
@ -241,11 +243,16 @@ impl Embedder {
|
||||
pub fn embed_chunks(
|
||||
&self,
|
||||
text_chunks: Vec<Vec<String>>,
|
||||
threads: &rayon::ThreadPool,
|
||||
threads: &ThreadPoolNoAbort,
|
||||
) -> Result<Vec<Vec<Embeddings<f32>>>, EmbedError> {
|
||||
threads.install(move || {
|
||||
text_chunks.into_par_iter().map(move |chunk| self.embed(chunk)).collect()
|
||||
})
|
||||
threads
|
||||
.install(move || {
|
||||
text_chunks.into_par_iter().map(move |chunk| self.embed(chunk)).collect()
|
||||
})
|
||||
.map_err(|error| EmbedError {
|
||||
kind: EmbedErrorKind::PanicInThreadPool(error),
|
||||
fault: FaultSource::Bug,
|
||||
})?
|
||||
}
|
||||
|
||||
pub fn chunk_count_hint(&self) -> usize {
|
||||
|
@ -2,9 +2,12 @@ use deserr::Deserr;
|
||||
use rayon::iter::{IntoParallelIterator as _, ParallelIterator as _};
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use super::error::EmbedErrorKind;
|
||||
use super::{
|
||||
DistributionShift, EmbedError, Embedding, Embeddings, NewEmbedderError, REQUEST_PARALLELISM,
|
||||
};
|
||||
use crate::error::FaultSource;
|
||||
use crate::ThreadPoolNoAbort;
|
||||
|
||||
// retrying in case of failure
|
||||
|
||||
@ -158,11 +161,16 @@ impl Embedder {
|
||||
pub fn embed_chunks(
|
||||
&self,
|
||||
text_chunks: Vec<Vec<String>>,
|
||||
threads: &rayon::ThreadPool,
|
||||
threads: &ThreadPoolNoAbort,
|
||||
) -> Result<Vec<Vec<Embeddings<f32>>>, EmbedError> {
|
||||
threads.install(move || {
|
||||
text_chunks.into_par_iter().map(move |chunk| self.embed(chunk)).collect()
|
||||
})
|
||||
threads
|
||||
.install(move || {
|
||||
text_chunks.into_par_iter().map(move |chunk| self.embed(chunk)).collect()
|
||||
})
|
||||
.map_err(|error| EmbedError {
|
||||
kind: EmbedErrorKind::PanicInThreadPool(error),
|
||||
fault: FaultSource::Bug,
|
||||
})?
|
||||
}
|
||||
|
||||
pub fn chunk_count_hint(&self) -> usize {
|
||||
|
@ -301,10 +301,14 @@ impl From<EmbeddingConfig> for EmbeddingSettings {
|
||||
fn from(value: EmbeddingConfig) -> Self {
|
||||
let EmbeddingConfig { embedder_options, prompt } = value;
|
||||
match embedder_options {
|
||||
super::EmbedderOptions::HuggingFace(options) => Self {
|
||||
super::EmbedderOptions::HuggingFace(super::hf::EmbedderOptions {
|
||||
model,
|
||||
revision,
|
||||
distribution,
|
||||
}) => Self {
|
||||
source: Setting::Set(EmbedderSource::HuggingFace),
|
||||
model: Setting::Set(options.model),
|
||||
revision: options.revision.map(Setting::Set).unwrap_or_default(),
|
||||
model: Setting::Set(model),
|
||||
revision: revision.map(Setting::Set).unwrap_or_default(),
|
||||
api_key: Setting::NotSet,
|
||||
dimensions: Setting::NotSet,
|
||||
document_template: Setting::Set(prompt.template),
|
||||
@ -314,14 +318,19 @@ impl From<EmbeddingConfig> for EmbeddingSettings {
|
||||
path_to_embeddings: Setting::NotSet,
|
||||
embedding_object: Setting::NotSet,
|
||||
input_type: Setting::NotSet,
|
||||
distribution: options.distribution.map(Setting::Set).unwrap_or_default(),
|
||||
distribution: distribution.map(Setting::Set).unwrap_or_default(),
|
||||
},
|
||||
super::EmbedderOptions::OpenAi(options) => Self {
|
||||
super::EmbedderOptions::OpenAi(super::openai::EmbedderOptions {
|
||||
api_key,
|
||||
embedding_model,
|
||||
dimensions,
|
||||
distribution,
|
||||
}) => Self {
|
||||
source: Setting::Set(EmbedderSource::OpenAi),
|
||||
model: Setting::Set(options.embedding_model.name().to_owned()),
|
||||
model: Setting::Set(embedding_model.name().to_owned()),
|
||||
revision: Setting::NotSet,
|
||||
api_key: options.api_key.map(Setting::Set).unwrap_or_default(),
|
||||
dimensions: options.dimensions.map(Setting::Set).unwrap_or_default(),
|
||||
api_key: api_key.map(Setting::Set).unwrap_or_default(),
|
||||
dimensions: dimensions.map(Setting::Set).unwrap_or_default(),
|
||||
document_template: Setting::Set(prompt.template),
|
||||
url: Setting::NotSet,
|
||||
query: Setting::NotSet,
|
||||
@ -329,29 +338,37 @@ impl From<EmbeddingConfig> for EmbeddingSettings {
|
||||
path_to_embeddings: Setting::NotSet,
|
||||
embedding_object: Setting::NotSet,
|
||||
input_type: Setting::NotSet,
|
||||
distribution: options.distribution.map(Setting::Set).unwrap_or_default(),
|
||||
distribution: distribution.map(Setting::Set).unwrap_or_default(),
|
||||
},
|
||||
super::EmbedderOptions::Ollama(options) => Self {
|
||||
super::EmbedderOptions::Ollama(super::ollama::EmbedderOptions {
|
||||
embedding_model,
|
||||
url,
|
||||
api_key,
|
||||
distribution,
|
||||
}) => Self {
|
||||
source: Setting::Set(EmbedderSource::Ollama),
|
||||
model: Setting::Set(options.embedding_model.to_owned()),
|
||||
model: Setting::Set(embedding_model),
|
||||
revision: Setting::NotSet,
|
||||
api_key: Setting::NotSet,
|
||||
api_key: api_key.map(Setting::Set).unwrap_or_default(),
|
||||
dimensions: Setting::NotSet,
|
||||
document_template: Setting::Set(prompt.template),
|
||||
url: Setting::NotSet,
|
||||
url: url.map(Setting::Set).unwrap_or_default(),
|
||||
query: Setting::NotSet,
|
||||
input_field: Setting::NotSet,
|
||||
path_to_embeddings: Setting::NotSet,
|
||||
embedding_object: Setting::NotSet,
|
||||
input_type: Setting::NotSet,
|
||||
distribution: options.distribution.map(Setting::Set).unwrap_or_default(),
|
||||
distribution: distribution.map(Setting::Set).unwrap_or_default(),
|
||||
},
|
||||
super::EmbedderOptions::UserProvided(options) => Self {
|
||||
super::EmbedderOptions::UserProvided(super::manual::EmbedderOptions {
|
||||
dimensions,
|
||||
distribution,
|
||||
}) => Self {
|
||||
source: Setting::Set(EmbedderSource::UserProvided),
|
||||
model: Setting::NotSet,
|
||||
revision: Setting::NotSet,
|
||||
api_key: Setting::NotSet,
|
||||
dimensions: Setting::Set(options.dimensions),
|
||||
dimensions: Setting::Set(dimensions),
|
||||
document_template: Setting::NotSet,
|
||||
url: Setting::NotSet,
|
||||
query: Setting::NotSet,
|
||||
@ -359,7 +376,7 @@ impl From<EmbeddingConfig> for EmbeddingSettings {
|
||||
path_to_embeddings: Setting::NotSet,
|
||||
embedding_object: Setting::NotSet,
|
||||
input_type: Setting::NotSet,
|
||||
distribution: options.distribution.map(Setting::Set).unwrap_or_default(),
|
||||
distribution: distribution.map(Setting::Set).unwrap_or_default(),
|
||||
},
|
||||
super::EmbedderOptions::Rest(super::rest::EmbedderOptions {
|
||||
api_key,
|
||||
|
@ -217,9 +217,7 @@ fn add_memory_samples(
|
||||
memory_counters: &mut Option<MemoryCounterHandles>,
|
||||
last_memory: &mut MemoryStats,
|
||||
) -> Option<MemoryStats> {
|
||||
let Some(stats) = memory else {
|
||||
return None;
|
||||
};
|
||||
let stats = memory?;
|
||||
|
||||
let memory_counters =
|
||||
memory_counters.get_or_insert_with(|| MemoryCounterHandles::new(profile, main));
|
||||
|
68
workloads/movies-subset-hf-embeddings.json
Normal file
68
workloads/movies-subset-hf-embeddings.json
Normal file
@ -0,0 +1,68 @@
|
||||
{
|
||||
"name": "movies-subset-hf-embeddings",
|
||||
"run_count": 5,
|
||||
"extra_cli_args": [
|
||||
"--max-indexing-threads=4"
|
||||
],
|
||||
"assets": {
|
||||
"movies-100.json": {
|
||||
"local_location": null,
|
||||
"remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/movies-100.json",
|
||||
"sha256": "d215e395e4240f12f03b8f1f68901eac82d9e7ded5b462cbf4a6b8efde76c6c6"
|
||||
}
|
||||
},
|
||||
"commands": [
|
||||
{
|
||||
"route": "experimental-features",
|
||||
"method": "PATCH",
|
||||
"body": {
|
||||
"inline": {
|
||||
"vectorStore": true
|
||||
}
|
||||
},
|
||||
"synchronous": "DontWait"
|
||||
},
|
||||
{
|
||||
"route": "indexes/movies/settings",
|
||||
"method": "PATCH",
|
||||
"body": {
|
||||
"inline": {
|
||||
"searchableAttributes": [
|
||||
"title",
|
||||
"overview"
|
||||
],
|
||||
"filterableAttributes": [
|
||||
"genres",
|
||||
"release_date"
|
||||
],
|
||||
"sortableAttributes": [
|
||||
"release_date"
|
||||
]
|
||||
}
|
||||
},
|
||||
"synchronous": "WaitForTask"
|
||||
},
|
||||
{
|
||||
"route": "indexes/movies/settings",
|
||||
"method": "PATCH",
|
||||
"body": {
|
||||
"inline": {
|
||||
"embedders": {
|
||||
"default": {
|
||||
"source": "huggingFace"
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"synchronous": "WaitForTask"
|
||||
},
|
||||
{
|
||||
"route": "indexes/movies/documents",
|
||||
"method": "POST",
|
||||
"body": {
|
||||
"asset": "movies-100.json"
|
||||
},
|
||||
"synchronous": "WaitForTask"
|
||||
}
|
||||
]
|
||||
}
|
72
workloads/settings-add-embeddings.json
Normal file
72
workloads/settings-add-embeddings.json
Normal file
@ -0,0 +1,72 @@
|
||||
{
|
||||
"name": "settings-add-embeddings-hf",
|
||||
"run_count": 5,
|
||||
"extra_cli_args": [
|
||||
"--max-indexing-threads=4"
|
||||
],
|
||||
"assets": {
|
||||
"movies-100.json": {
|
||||
"local_location": null,
|
||||
"remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/movies-100.json",
|
||||
"sha256": "d215e395e4240f12f03b8f1f68901eac82d9e7ded5b462cbf4a6b8efde76c6c6"
|
||||
}
|
||||
},
|
||||
"commands": [
|
||||
{
|
||||
"route": "experimental-features",
|
||||
"method": "PATCH",
|
||||
"body": {
|
||||
"inline": {
|
||||
"vectorStore": true
|
||||
}
|
||||
},
|
||||
"synchronous": "DontWait"
|
||||
},
|
||||
{
|
||||
"route": "indexes/movies/settings",
|
||||
"method": "PATCH",
|
||||
"body": {
|
||||
"inline": {
|
||||
"searchableAttributes": [
|
||||
"title",
|
||||
"overview"
|
||||
],
|
||||
"filterableAttributes": [
|
||||
"genres",
|
||||
"release_date"
|
||||
],
|
||||
"sortableAttributes": [
|
||||
"release_date"
|
||||
]
|
||||
}
|
||||
},
|
||||
"synchronous": "DontWait"
|
||||
},
|
||||
{
|
||||
"route": "indexes/movies/documents",
|
||||
"method": "POST",
|
||||
"body": {
|
||||
"asset": "movies-100.json"
|
||||
},
|
||||
"synchronous": "WaitForTask"
|
||||
},
|
||||
{
|
||||
"route": "indexes/movies/settings",
|
||||
"method": "PATCH",
|
||||
"body": {
|
||||
"inline": {
|
||||
"embedders": {
|
||||
"default": {
|
||||
"source": "huggingFace",
|
||||
"model": null,
|
||||
"revision": null,
|
||||
"documentTemplate": null,
|
||||
"distribution": null
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"synchronous": "WaitForTask"
|
||||
}
|
||||
]
|
||||
}
|
@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "settings-add-remove-filters.json",
|
||||
"run_count": 2,
|
||||
"run_count": 5,
|
||||
"extra_cli_args": [
|
||||
"--max-indexing-threads=4"
|
||||
],
|
||||
|
@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "settings-proximity-precision.json",
|
||||
"run_count": 2,
|
||||
"run_count": 5,
|
||||
"extra_cli_args": [
|
||||
"--max-indexing-threads=4"
|
||||
],
|
||||
|
@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "settings-remove-add-swap-searchable.json",
|
||||
"run_count": 2,
|
||||
"run_count": 5,
|
||||
"extra_cli_args": [
|
||||
"--max-indexing-threads=4"
|
||||
],
|
||||
|
@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "settings-typo.json",
|
||||
"run_count": 2,
|
||||
"run_count": 5,
|
||||
"extra_cli_args": [
|
||||
"--max-indexing-threads=4"
|
||||
],
|
||||
|
Reference in New Issue
Block a user