write the dump export

This commit is contained in:
Tamo
2022-10-13 15:02:59 +02:00
committed by Clément Renault
parent 8954b1bd1d
commit b7f9c94f4a
25 changed files with 686 additions and 184 deletions

View File

@@ -9,5 +9,7 @@ pub mod tasks;
pub use milli;
pub use milli::heed;
pub use milli::Index;
use uuid::Uuid;
/// A document represented as a JSON object: string keys mapped to arbitrary
/// JSON values.
pub type Document = serde_json::Map<String, serde_json::Value>;
/// Alias of [`Uuid`] used wherever an instance uid is expected.
pub type InstanceUid = Uuid;

View File

@@ -2,9 +2,15 @@ use std::collections::{BTreeMap, BTreeSet};
use std::marker::PhantomData;
use std::num::NonZeroUsize;
use fst::IntoStreamer;
use milli::update::Setting;
use milli::{Index, DEFAULT_VALUES_PER_FACET};
use serde::{Deserialize, Serialize, Serializer};
/// The maximum number of results that the engine
/// will be able to return in one search call.
pub const DEFAULT_PAGINATION_MAX_TOTAL_HITS: usize = 1000;
fn serialize_with_wildcard<S>(
field: &Setting<Vec<String>>,
s: S,
@@ -366,6 +372,114 @@ pub fn apply_settings_to_builder(
}
}
/// Builds the [`Settings`] currently stored in `index`, read through `rtxn`.
///
/// Attributes that the index reports as absent (displayed attributes,
/// searchable attributes and the distinct attribute) are returned as
/// `Setting::Reset`; every other field is returned as `Setting::Set`, with
/// `DEFAULT_VALUES_PER_FACET` and `DEFAULT_PAGINATION_MAX_TOTAL_HITS` used
/// when the index has no stored value for faceting or pagination.
///
/// # Errors
///
/// Propagates the first error raised while reading from the index or while
/// decoding the stop-word / exact-word sets into strings.
pub fn settings(
    index: &Index,
    rtxn: &crate::heed::RoTxn,
) -> Result<Settings<Checked>, milli::Error> {
    // The index reads below are kept in a fixed order so that, should several
    // of them fail, the same error is reported first.
    let displayed = index
        .displayed_fields(rtxn)?
        .map(|names| names.into_iter().map(String::from).collect());
    let searchable = index
        .user_defined_searchable_fields(rtxn)?
        .map(|names| names.into_iter().map(String::from).collect());

    let filterable = index.filterable_fields(rtxn)?.into_iter().collect();
    let sortable = index.sortable_fields(rtxn)?.into_iter().collect();

    let ranking_rules = index
        .criteria(rtxn)?
        .into_iter()
        .map(|criterion| criterion.to_string())
        .collect();

    // No stored stop words is reported as an empty set.
    let stop_words = match index.stop_words(rtxn)? {
        Some(set) => set.stream().into_strs()?.into_iter().collect(),
        None => BTreeSet::new(),
    };

    let distinct = index.distinct_field(rtxn)?.map(String::from);

    // milli stores each synonym entry as a sequence of words and does not keep
    // the separator they were originally split on, so the words are glued back
    // together with a single space.
    let synonyms = index
        .synonyms(rtxn)?
        .iter()
        .map(|(words, alternatives)| {
            (
                words.join(" "),
                alternatives
                    .iter()
                    .map(|alternative| alternative.join(" "))
                    .collect(),
            )
        })
        .collect();

    let one_typo = index.min_word_len_one_typo(rtxn)?;
    let two_typos = index.min_word_len_two_typos(rtxn)?;

    // No stored exact words is reported as an empty set.
    let exact_words = index
        .exact_words(rtxn)?
        .map(|set| -> Result<BTreeSet<_>, milli::Error> {
            Ok(set.into_stream().into_strs()?.into_iter().collect())
        })
        .transpose()?
        .unwrap_or_default();
    let exact_attributes = index
        .exact_attributes(rtxn)?
        .into_iter()
        .map(String::from)
        .collect();
    let typos_enabled = index.authorize_typos(rtxn)?;

    let max_values_per_facet = index
        .max_values_per_facet(rtxn)?
        .unwrap_or(DEFAULT_VALUES_PER_FACET);
    let max_total_hits = index
        .pagination_max_total_hits(rtxn)?
        .unwrap_or(DEFAULT_PAGINATION_MAX_TOTAL_HITS);

    Ok(Settings {
        displayed_attributes: displayed.map_or(Setting::Reset, Setting::Set),
        searchable_attributes: searchable.map_or(Setting::Reset, Setting::Set),
        filterable_attributes: Setting::Set(filterable),
        sortable_attributes: Setting::Set(sortable),
        ranking_rules: Setting::Set(ranking_rules),
        stop_words: Setting::Set(stop_words),
        distinct_attribute: distinct.map_or(Setting::Reset, Setting::Set),
        synonyms: Setting::Set(synonyms),
        typo_tolerance: Setting::Set(TypoSettings {
            enabled: Setting::Set(typos_enabled),
            min_word_size_for_typos: Setting::Set(MinWordSizeTyposSetting {
                one_typo: Setting::Set(one_typo),
                two_typos: Setting::Set(two_typos),
            }),
            disable_on_words: Setting::Set(exact_words),
            disable_on_attributes: Setting::Set(exact_attributes),
        }),
        faceting: Setting::Set(FacetingSettings {
            max_values_per_facet: Setting::Set(max_values_per_facet),
        }),
        pagination: Setting::Set(PaginationSettings {
            max_total_hits: Setting::Set(max_total_hits),
        }),
        _kind: PhantomData,
    })
}
#[cfg(test)]
pub(crate) mod test {
use proptest::prelude::*;

View File

@@ -3,7 +3,6 @@ use roaring::RoaringBitmap;
use serde::{Deserialize, Serialize, Serializer};
use std::{
fmt::{Display, Write},
path::PathBuf,
str::FromStr,
};
use time::{Duration, OffsetDateTime};
@@ -11,7 +10,9 @@ use uuid::Uuid;
use crate::{
error::{Code, ResponseError},
keys::Key,
settings::{Settings, Unchecked},
InstanceUid,
};
/// Alias used for task ids; currently a plain `u32`.
pub type TaskId = u32;
@@ -71,6 +72,26 @@ impl Task {
IndexSwap { lhs, rhs } => Some(vec![lhs, rhs]),
}
}
/// Returns a reference to the content file uuid carried by this task.
///
/// Only `DocumentImport` tasks hold a `content_file`; every other kind
/// yields `None`. The variants are listed one by one (no `_` arm) so that
/// adding a new kind forces this method to be revisited.
pub fn content_uuid(&self) -> Option<&Uuid> {
    match &self.kind {
        KindWithContent::DocumentImport { content_file, .. } => Some(content_file),
        KindWithContent::Snapshot
        | KindWithContent::DumpExport { .. }
        | KindWithContent::DeleteTasks { .. }
        | KindWithContent::CancelTask { .. }
        | KindWithContent::IndexSwap { .. }
        | KindWithContent::IndexUpdate { .. }
        | KindWithContent::IndexCreation { .. }
        | KindWithContent::IndexDeletion { .. }
        | KindWithContent::Settings { .. }
        | KindWithContent::DocumentClear { .. }
        | KindWithContent::DocumentDeletion { .. } => None,
    }
}
}
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
@@ -120,7 +141,9 @@ pub enum KindWithContent {
tasks: RoaringBitmap,
},
DumpExport {
output: PathBuf,
dump_uid: String,
keys: Vec<Key>,
instance_uid: Option<InstanceUid>,
},
Snapshot,
}
@@ -167,7 +190,7 @@ impl KindWithContent {
documents_count, ..
} => Some(Details::DocumentAddition {
received_documents: *documents_count,
indexed_documents: 0,
indexed_documents: Some(0),
}),
KindWithContent::DocumentDeletion {
index_uid: _,
@@ -204,6 +227,38 @@ impl KindWithContent {
}
}
/// Derives the [`Details`] of a task from its kind, when that kind has any.
///
/// Kinds with no associated details convert to `None`. For a document
/// import, `indexed_documents` is left as `None`.
///
/// # Panics
///
/// Converting a `KindWithContent::DeleteTasks` hits a `todo!()` and panics.
impl From<&KindWithContent> for Option<Details> {
    fn from(kind: &KindWithContent) -> Self {
        let details = match kind {
            KindWithContent::DocumentImport {
                documents_count, ..
            } => Details::DocumentAddition {
                received_documents: *documents_count,
                indexed_documents: None,
            },
            KindWithContent::Settings { new_settings, .. } => Details::Settings {
                settings: new_settings.clone(),
            },
            // Index creation and index update expose the same information.
            KindWithContent::IndexCreation { primary_key, .. }
            | KindWithContent::IndexUpdate { primary_key, .. } => Details::IndexInfo {
                primary_key: primary_key.clone(),
            },
            KindWithContent::DumpExport { dump_uid, .. } => Details::Dump {
                dump_uid: dump_uid.clone(),
            },
            // NOTE(review): still unimplemented; this arm panics at runtime.
            KindWithContent::DeleteTasks { .. } => todo!(),
            // These kinds carry no details.
            KindWithContent::DocumentDeletion { .. }
            | KindWithContent::DocumentClear { .. }
            | KindWithContent::IndexDeletion { .. }
            | KindWithContent::IndexSwap { .. }
            | KindWithContent::CancelTask { .. }
            | KindWithContent::Snapshot => return None,
        };

        Some(details)
    }
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub enum Status {
@@ -289,7 +344,7 @@ impl FromStr for Kind {
pub enum Details {
DocumentAddition {
received_documents: u64,
indexed_documents: u64,
indexed_documents: Option<u64>,
},
Settings {
settings: Settings<Unchecked>,