Merge branch 'release-v0.30.0' into stable

curquiza
2022-11-28 13:56:35 +01:00
334 changed files with 29672 additions and 10603 deletions

View File

@@ -1,19 +0,0 @@
# Seeds for failure cases proptest has generated in the past. It is
# automatically read and these particular cases re-run before any
# novel cases are generated.
#
# It is recommended to check this file in to source control so that
# everyone who runs the test benefits from these saved cases.
cc 6f3ae3cba934ba3e328e2306218c32f27a46ce2d54a1258b05fef65663208662 # shrinks to task = Task { id: 0, index_uid: IndexUid("a"), content: DocumentAddition { content_uuid: 37bc137d-2038-47f0-819f-b133233daadc, merge_strategy: ReplaceDocuments, primary_key: None, documents_count: 0 }, events: [] }
cc b726f7d9f44a9216aad302ddba0f04e7108817e741d656a4759aea8562de4d63 # shrinks to task = Task { id: 0, index_uid: IndexUid("_"), content: IndexDeletion, events: [] }, index_exists = false, index_op_fails = false, any_int = 0
cc 427ec2dde3260b1ab334207bdc22adef28a5b8532b9902c84b55fd2c017ea7e1 # shrinks to task = Task { id: 0, index_uid: IndexUid("A"), content: IndexDeletion, events: [] }, index_exists = true, index_op_fails = false, any_int = 0
cc c24f3d42f0f36fbdbf4e9d4327e75529b163ac580d63a5934ca05e9b5bd23a65 # shrinks to task = Task { id: 0, index_uid: IndexUid("a"), content: IndexDeletion, events: [] }, index_exists = true, index_op_fails = true, any_int = 0
cc 8084e2410801b997533b0bcbad75cd212873cfc2677f26847f68c568ead1604c # shrinks to task = Task { id: 0, index_uid: IndexUid("A"), content: SettingsUpdate { settings: Settings { displayed_attributes: NotSet, searchable_attributes: NotSet, filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, synonyms: NotSet, distinct_attribute: NotSet, _kind: PhantomData }, is_deletion: false }, events: [] }, index_exists = false, index_op_fails = false, any_int = 0
cc 330085e0200a9a2ddfdd764a03d768aa95c431bcaafbd530c8c949425beed18b # shrinks to task = Task { id: 0, index_uid: IndexUid("a"), content: CreateIndex { primary_key: None }, events: [] }, index_exists = false, index_op_fails = true, any_int = 0
cc c70e901576ef2fb9622e814bdecd11e4747cd70d71a9a6ce771b5b7256a187c0 # shrinks to task = Task { id: 0, index_uid: IndexUid("a"), content: SettingsUpdate { settings: Settings { displayed_attributes: NotSet, searchable_attributes: NotSet, filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, synonyms: NotSet, distinct_attribute: NotSet, _kind: PhantomData }, is_deletion: true }, events: [] }, index_exists = false, index_op_fails = false, any_int = 0
cc 3fe2c38cbc2cca34ecde321472141d386056f0cd332cbf700773657715a382b5 # shrinks to task = Task { id: 0, index_uid: IndexUid("a"), content: UpdateIndex { primary_key: None }, events: [] }, index_exists = false, index_op_fails = false, any_int = 0
cc c31cf86692968483f1ab08a6a9d4667ccb9635c306998551bf1eb1f135ef0d4b # shrinks to task = Task { id: 0, index_uid: IndexUid("a"), content: UpdateIndex { primary_key: Some("") }, events: [] }, index_exists = true, index_op_fails = false, any_int = 0
cc 3a01c78db082434b8a4f8914abf0d1059d39f4426d16df20d72e1bd7ebb94a6a # shrinks to task = Task { id: 0, index_uid: IndexUid("0"), content: UpdateIndex { primary_key: None }, events: [] }, index_exists = true, index_op_fails = true, any_int = 0
cc c450806df3921d1e6fe9b6af93d999e8196d0175b69b64f1810802582421e94a # shrinks to task = Task { id: 0, index_uid: IndexUid("a"), content: CreateIndex { primary_key: Some("") }, events: [] }, index_exists = false, index_op_fails = false, any_int = 0
cc fb6b98947cbdbdee05ed3c0bf2923aad2c311edc276253642eb43a0c0ec4888a # shrinks to task = Task { id: 0, index_uid: IndexUid("A"), content: CreateIndex { primary_key: Some("") }, events: [] }, index_exists = false, index_op_fails = true, any_int = 0
cc 1aa59d8e22484e9915efbb5818e1e1ab684aa61b166dc82130d6221663ba00bf # shrinks to task = Task { id: 0, index_uid: IndexUid("a"), content: DocumentDeletion(Clear), events: [] }, index_exists = true, index_op_fails = false, any_int = 0

View File

@@ -1,7 +0,0 @@
# Seeds for failure cases proptest has generated in the past. It is
# automatically read and these particular cases re-run before any
# novel cases are generated.
#
# It is recommended to check this file in to source control so that
# everyone who runs the test benefits from these saved cases.
cc 8cbd6c45ce8c5611ec3f2f94fd485f6a8eeccc470fa426e59bdfd4d9e7fce0e1 # shrinks to bytes = []

View File

@@ -1,8 +0,0 @@
use std::{fs, path::Path};
/// Copy the `instance-uid` contained in one db to another. Ignore all errors.
pub fn copy_user_id(src: &Path, dst: &Path) {
if let Ok(user_id) = fs::read_to_string(src.join("instance-uid")) {
let _ = fs::write(dst.join("instance-uid"), &user_id);
}
}

View File

@@ -1,26 +0,0 @@
use std::fs::{create_dir_all, File};
use std::io::Write;
use std::path::Path;
use flate2::{read::GzDecoder, write::GzEncoder, Compression};
use tar::{Archive, Builder};
pub fn to_tar_gz(src: impl AsRef<Path>, dest: impl AsRef<Path>) -> anyhow::Result<()> {
let mut f = File::create(dest)?;
let gz_encoder = GzEncoder::new(&mut f, Compression::default());
let mut tar_encoder = Builder::new(gz_encoder);
tar_encoder.append_dir_all(".", src)?;
let gz_encoder = tar_encoder.into_inner()?;
gz_encoder.finish()?;
f.flush()?;
Ok(())
}
pub fn from_tar_gz(src: impl AsRef<Path>, dest: impl AsRef<Path>) -> anyhow::Result<()> {
let f = File::open(&src)?;
let gz = GzDecoder::new(f);
let mut ar = Archive::new(gz);
create_dir_all(&dest)?;
ar.unpack(&dest)?;
Ok(())
}
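// A minimal round-trip sketch for the two helpers above; the `scratch/...`
// paths and the function name are made up for illustration.
fn tar_gz_roundtrip_example() -> anyhow::Result<()> {
    std::fs::create_dir_all("scratch/src")?;
    std::fs::write("scratch/src/hello.txt", b"hello")?;
    // Pack the contents of `scratch/src` into a gzipped tarball...
    to_tar_gz("scratch/src", "scratch/archive.tar.gz")?;
    // ...then unpack it into a fresh directory.
    from_tar_gz("scratch/archive.tar.gz", "scratch/restored")?;
    assert_eq!(std::fs::read("scratch/restored/hello.txt")?, b"hello".to_vec());
    Ok(())
}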

View File

@@ -1,155 +0,0 @@
use std::borrow::Borrow;
use std::fmt::{self, Debug, Display};
use std::io::{self, BufReader, Read, Seek, Write};
use either::Either;
use meilisearch_types::error::{Code, ErrorCode};
use meilisearch_types::internal_error;
use milli::documents::{DocumentsBatchBuilder, Error};
use milli::Object;
use serde::Deserialize;
type Result<T> = std::result::Result<T, DocumentFormatError>;
#[derive(Debug)]
pub enum PayloadType {
Ndjson,
Json,
Csv,
}
impl fmt::Display for PayloadType {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
PayloadType::Ndjson => f.write_str("ndjson"),
PayloadType::Json => f.write_str("json"),
PayloadType::Csv => f.write_str("csv"),
}
}
}
#[derive(Debug)]
pub enum DocumentFormatError {
Internal(Box<dyn std::error::Error + Send + Sync + 'static>),
MalformedPayload(Error, PayloadType),
}
impl Display for DocumentFormatError {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
Self::Internal(e) => write!(f, "An internal error has occurred: `{}`.", e),
Self::MalformedPayload(me, b) => match me.borrow() {
Error::Json(se) => {
// https://github.com/meilisearch/meilisearch/issues/2107
// The user input may be insanely long. We need to truncate it.
let mut serde_msg = se.to_string();
let ellipsis = "...";
if serde_msg.len() > 100 + ellipsis.len() {
serde_msg.replace_range(50..serde_msg.len() - 50, ellipsis);
}
write!(
f,
"The `{}` payload provided is malformed. `Couldn't serialize document value: {}`.",
b, serde_msg
)
}
_ => write!(f, "The `{}` payload provided is malformed: `{}`.", b, me),
},
}
}
}
impl std::error::Error for DocumentFormatError {}
impl From<(PayloadType, Error)> for DocumentFormatError {
fn from((ty, error): (PayloadType, Error)) -> Self {
match error {
Error::Io(e) => Self::Internal(Box::new(e)),
e => Self::MalformedPayload(e, ty),
}
}
}
impl ErrorCode for DocumentFormatError {
fn error_code(&self) -> Code {
match self {
DocumentFormatError::Internal(_) => Code::Internal,
DocumentFormatError::MalformedPayload(_, _) => Code::MalformedPayload,
}
}
}
internal_error!(DocumentFormatError: io::Error);
/// Reads CSV from the input and writes an obkv batch to the writer.
pub fn read_csv(input: impl Read, writer: impl Write + Seek) -> Result<usize> {
let mut builder = DocumentsBatchBuilder::new(writer);
let csv = csv::Reader::from_reader(input);
builder.append_csv(csv).map_err(|e| (PayloadType::Csv, e))?;
let count = builder.documents_count();
let _ = builder
.into_inner()
.map_err(Into::into)
.map_err(DocumentFormatError::Internal)?;
Ok(count as usize)
}
/// Reads JSON Lines from the input and writes an obkv batch to the writer.
pub fn read_ndjson(input: impl Read, writer: impl Write + Seek) -> Result<usize> {
let mut builder = DocumentsBatchBuilder::new(writer);
let reader = BufReader::new(input);
for result in serde_json::Deserializer::from_reader(reader).into_iter() {
let object = result
.map_err(Error::Json)
.map_err(|e| (PayloadType::Ndjson, e))?;
builder
.append_json_object(&object)
.map_err(Into::into)
.map_err(DocumentFormatError::Internal)?;
}
let count = builder.documents_count();
let _ = builder
.into_inner()
.map_err(Into::into)
.map_err(DocumentFormatError::Internal)?;
Ok(count as usize)
}
/// Reads JSON from the input and writes an obkv batch to the writer.
pub fn read_json(input: impl Read, writer: impl Write + Seek) -> Result<usize> {
let mut builder = DocumentsBatchBuilder::new(writer);
let reader = BufReader::new(input);
#[derive(Deserialize, Debug)]
#[serde(transparent)]
struct ArrayOrSingleObject {
#[serde(with = "either::serde_untagged")]
inner: Either<Vec<Object>, Object>,
}
let content: ArrayOrSingleObject = serde_json::from_reader(reader)
.map_err(Error::Json)
.map_err(|e| (PayloadType::Json, e))?;
for object in content.inner.map_right(|o| vec![o]).into_inner() {
builder
.append_json_object(&object)
.map_err(Into::into)
.map_err(DocumentFormatError::Internal)?;
}
let count = builder.documents_count();
let _ = builder
.into_inner()
.map_err(Into::into)
.map_err(DocumentFormatError::Internal)?;
Ok(count as usize)
}
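// A minimal sketch of how `read_json` is driven: the payload below is made up,
// and the obkv batch is collected into an in-memory buffer instead of a real file.
fn read_json_example() -> Result<usize> {
    use std::io::Cursor;
    let payload = br#"[{"id": 1, "title": "Shazam!"}, {"id": 2, "title": "Captain Marvel"}]"#;
    let mut batch = Cursor::new(Vec::new());
    // Two documents go in, so two documents are reported for the batch.
    let count = read_json(&payload[..], &mut batch)?;
    assert_eq!(count, 2);
    Ok(count)
}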

View File

@@ -1,17 +0,0 @@
pub mod v2;
pub mod v3;
pub mod v4;
/// Parses the v1 version of the Asc ranking rule `asc(price)` and returns the field name.
pub fn asc_ranking_rule(text: &str) -> Option<&str> {
text.split_once("asc(")
.and_then(|(_, tail)| tail.rsplit_once(')'))
.map(|(field, _)| field)
}
/// Parses the v1 version of the Desc ranking rule `desc(price)` and returns the field name.
pub fn desc_ranking_rule(text: &str) -> Option<&str> {
text.split_once("desc(")
.and_then(|(_, tail)| tail.rsplit_once(')'))
.map(|(field, _)| field)
}
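// A quick illustration of the two parsers above; the rule strings are examples.
fn ranking_rule_parsing_examples() {
    assert_eq!(asc_ranking_rule("asc(price)"), Some("price"));
    assert_eq!(desc_ranking_rule("desc(release_date)"), Some("release_date"));
    // Anything that is not an `asc(...)`/`desc(...)` rule yields `None`.
    assert_eq!(asc_ranking_rule("words"), None);
}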

View File

@@ -1,152 +0,0 @@
use anyhow::bail;
use meilisearch_types::error::Code;
use milli::update::IndexDocumentsMethod;
use serde::{Deserialize, Serialize};
use time::OffsetDateTime;
use uuid::Uuid;
use crate::index::{Settings, Unchecked};
#[derive(Serialize, Deserialize)]
pub struct UpdateEntry {
pub uuid: Uuid,
pub update: UpdateStatus,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum UpdateFormat {
Json,
}
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct DocumentAdditionResult {
pub nb_documents: usize,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum UpdateResult {
DocumentsAddition(DocumentAdditionResult),
DocumentDeletion { deleted: u64 },
Other,
}
#[allow(clippy::large_enum_variant)]
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(tag = "type")]
pub enum UpdateMeta {
DocumentsAddition {
method: IndexDocumentsMethod,
format: UpdateFormat,
primary_key: Option<String>,
},
ClearDocuments,
DeleteDocuments {
ids: Vec<String>,
},
Settings(Settings<Unchecked>),
}
#[derive(Debug, Serialize, Deserialize, Clone)]
#[serde(rename_all = "camelCase")]
pub struct Enqueued {
pub update_id: u64,
pub meta: UpdateMeta,
#[serde(with = "time::serde::rfc3339")]
pub enqueued_at: OffsetDateTime,
pub content: Option<Uuid>,
}
#[derive(Debug, Serialize, Deserialize, Clone)]
#[serde(rename_all = "camelCase")]
pub struct Processed {
pub success: UpdateResult,
#[serde(with = "time::serde::rfc3339")]
pub processed_at: OffsetDateTime,
#[serde(flatten)]
pub from: Processing,
}
#[derive(Debug, Serialize, Deserialize, Clone)]
#[serde(rename_all = "camelCase")]
pub struct Processing {
#[serde(flatten)]
pub from: Enqueued,
#[serde(with = "time::serde::rfc3339")]
pub started_processing_at: OffsetDateTime,
}
#[derive(Debug, Serialize, Deserialize, Clone)]
#[serde(rename_all = "camelCase")]
pub struct Aborted {
#[serde(flatten)]
pub from: Enqueued,
#[serde(with = "time::serde::rfc3339")]
pub aborted_at: OffsetDateTime,
}
#[derive(Debug, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct Failed {
#[serde(flatten)]
pub from: Processing,
pub error: ResponseError,
#[serde(with = "time::serde::rfc3339")]
pub failed_at: OffsetDateTime,
}
#[derive(Debug, Serialize, Deserialize)]
#[serde(tag = "status", rename_all = "camelCase")]
pub enum UpdateStatus {
Processing(Processing),
Enqueued(Enqueued),
Processed(Processed),
Aborted(Aborted),
Failed(Failed),
}
type StatusCode = ();
#[derive(Debug, Serialize, Deserialize, Clone)]
#[serde(rename_all = "camelCase")]
pub struct ResponseError {
#[serde(skip)]
pub code: StatusCode,
pub message: String,
pub error_code: String,
pub error_type: String,
pub error_link: String,
}
pub fn error_code_from_str(s: &str) -> anyhow::Result<Code> {
let code = match s {
"index_creation_failed" => Code::CreateIndex,
"index_already_exists" => Code::IndexAlreadyExists,
"index_not_found" => Code::IndexNotFound,
"invalid_index_uid" => Code::InvalidIndexUid,
"invalid_state" => Code::InvalidState,
"missing_primary_key" => Code::MissingPrimaryKey,
"primary_key_already_present" => Code::PrimaryKeyAlreadyPresent,
"invalid_request" => Code::InvalidRankingRule,
"max_fields_limit_exceeded" => Code::MaxFieldsLimitExceeded,
"missing_document_id" => Code::MissingDocumentId,
"invalid_facet" => Code::Filter,
"invalid_filter" => Code::Filter,
"invalid_sort" => Code::Sort,
"bad_parameter" => Code::BadParameter,
"bad_request" => Code::BadRequest,
"document_not_found" => Code::DocumentNotFound,
"internal" => Code::Internal,
"invalid_geo_field" => Code::InvalidGeoField,
"invalid_token" => Code::InvalidToken,
"missing_authorization_header" => Code::MissingAuthorizationHeader,
"payload_too_large" => Code::PayloadTooLarge,
"unretrievable_document" => Code::RetrieveDocument,
"search_error" => Code::SearchDocuments,
"unsupported_media_type" => Code::UnsupportedMediaType,
"dump_already_in_progress" => Code::DumpAlreadyInProgress,
"dump_process_failed" => Code::DumpProcessFailed,
_ => bail!("unknown error code."),
};
Ok(code)
}

View File

@@ -1,205 +0,0 @@
use meilisearch_types::error::{Code, ResponseError};
use meilisearch_types::index_uid::IndexUid;
use milli::update::IndexDocumentsMethod;
use serde::{Deserialize, Serialize};
use time::OffsetDateTime;
use uuid::Uuid;
use super::v4::{Task, TaskContent, TaskEvent};
use crate::index::{Settings, Unchecked};
use crate::tasks::task::{DocumentDeletion, TaskId, TaskResult};
use super::v2;
#[derive(Serialize, Deserialize)]
pub struct DumpEntry {
pub uuid: Uuid,
pub uid: String,
}
#[derive(Serialize, Deserialize)]
pub struct UpdateEntry {
pub uuid: Uuid,
pub update: UpdateStatus,
}
#[derive(Debug, Serialize, Deserialize)]
#[serde(tag = "status", rename_all = "camelCase")]
pub enum UpdateStatus {
Processing(Processing),
Enqueued(Enqueued),
Processed(Processed),
Failed(Failed),
}
impl From<v2::UpdateResult> for TaskResult {
fn from(other: v2::UpdateResult) -> Self {
match other {
v2::UpdateResult::DocumentsAddition(result) => TaskResult::DocumentAddition {
indexed_documents: result.nb_documents as u64,
},
v2::UpdateResult::DocumentDeletion { deleted } => TaskResult::DocumentDeletion {
deleted_documents: deleted,
},
v2::UpdateResult::Other => TaskResult::Other,
}
}
}
#[allow(clippy::large_enum_variant)]
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum Update {
DeleteDocuments(Vec<String>),
DocumentAddition {
primary_key: Option<String>,
method: IndexDocumentsMethod,
content_uuid: Uuid,
},
Settings(Settings<Unchecked>),
ClearDocuments,
}
impl From<Update> for super::v4::TaskContent {
fn from(update: Update) -> Self {
match update {
Update::DeleteDocuments(ids) => {
TaskContent::DocumentDeletion(DocumentDeletion::Ids(ids))
}
Update::DocumentAddition {
primary_key,
method,
..
} => TaskContent::DocumentAddition {
content_uuid: Uuid::default(),
merge_strategy: method,
primary_key,
// document count is unknown for legacy updates
documents_count: 0,
allow_index_creation: true,
},
Update::Settings(settings) => TaskContent::SettingsUpdate {
settings,
// There is no way to know now, so we assume it isn't
is_deletion: false,
allow_index_creation: true,
},
Update::ClearDocuments => TaskContent::DocumentDeletion(DocumentDeletion::Clear),
}
}
}
#[allow(clippy::large_enum_variant)]
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(tag = "type")]
pub enum UpdateMeta {
DocumentsAddition {
method: IndexDocumentsMethod,
primary_key: Option<String>,
},
ClearDocuments,
DeleteDocuments {
ids: Vec<String>,
},
Settings(Settings<Unchecked>),
}
#[derive(Debug, Serialize, Deserialize, Clone)]
#[serde(rename_all = "camelCase")]
pub struct Enqueued {
pub update_id: u64,
pub meta: Update,
#[serde(with = "time::serde::rfc3339")]
pub enqueued_at: OffsetDateTime,
}
impl Enqueued {
fn update_task(self, task: &mut Task) {
// we do not erase the `TaskId` that was given to us.
task.content = self.meta.into();
task.events.push(TaskEvent::Created(self.enqueued_at));
}
}
#[derive(Debug, Serialize, Deserialize, Clone)]
#[serde(rename_all = "camelCase")]
pub struct Processed {
pub success: v2::UpdateResult,
#[serde(with = "time::serde::rfc3339")]
pub processed_at: OffsetDateTime,
#[serde(flatten)]
pub from: Processing,
}
impl Processed {
fn update_task(self, task: &mut Task) {
self.from.update_task(task);
let event = TaskEvent::Succeded {
result: TaskResult::from(self.success),
timestamp: self.processed_at,
};
task.events.push(event);
}
}
#[derive(Debug, Serialize, Deserialize, Clone)]
#[serde(rename_all = "camelCase")]
pub struct Processing {
#[serde(flatten)]
pub from: Enqueued,
#[serde(with = "time::serde::rfc3339")]
pub started_processing_at: OffsetDateTime,
}
impl Processing {
fn update_task(self, task: &mut Task) {
self.from.update_task(task);
let event = TaskEvent::Processing(self.started_processing_at);
task.events.push(event);
}
}
#[derive(Debug, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct Failed {
#[serde(flatten)]
pub from: Processing,
pub msg: String,
pub code: Code,
#[serde(with = "time::serde::rfc3339")]
pub failed_at: OffsetDateTime,
}
impl Failed {
fn update_task(self, task: &mut Task) {
self.from.update_task(task);
let event = TaskEvent::Failed {
error: ResponseError::from_msg(self.msg, self.code),
timestamp: self.failed_at,
};
task.events.push(event);
}
}
impl From<(UpdateStatus, String, TaskId)> for Task {
fn from((update, uid, task_id): (UpdateStatus, String, TaskId)) -> Self {
// Dummy task
let mut task = super::v4::Task {
id: task_id,
index_uid: IndexUid::new_unchecked(uid),
content: super::v4::TaskContent::IndexDeletion,
events: Vec::new(),
};
match update {
UpdateStatus::Processing(u) => u.update_task(&mut task),
UpdateStatus::Enqueued(u) => u.update_task(&mut task),
UpdateStatus::Processed(u) => u.update_task(&mut task),
UpdateStatus::Failed(u) => u.update_task(&mut task),
}
task
}
}

View File

@@ -1,145 +0,0 @@
use meilisearch_types::error::ResponseError;
use meilisearch_types::index_uid::IndexUid;
use milli::update::IndexDocumentsMethod;
use serde::{Deserialize, Serialize};
use time::OffsetDateTime;
use uuid::Uuid;
use crate::index::{Settings, Unchecked};
use crate::tasks::batch::BatchId;
use crate::tasks::task::{
DocumentDeletion, TaskContent as NewTaskContent, TaskEvent as NewTaskEvent, TaskId, TaskResult,
};
#[derive(Debug, Serialize, Deserialize)]
pub struct Task {
pub id: TaskId,
pub index_uid: IndexUid,
pub content: TaskContent,
pub events: Vec<TaskEvent>,
}
impl From<Task> for crate::tasks::task::Task {
fn from(other: Task) -> Self {
Self {
id: other.id,
content: NewTaskContent::from((other.index_uid, other.content)),
events: other.events.into_iter().map(Into::into).collect(),
}
}
}
#[derive(Debug, Serialize, Deserialize)]
pub enum TaskEvent {
Created(#[serde(with = "time::serde::rfc3339")] OffsetDateTime),
Batched {
#[serde(with = "time::serde::rfc3339")]
timestamp: OffsetDateTime,
batch_id: BatchId,
},
Processing(#[serde(with = "time::serde::rfc3339")] OffsetDateTime),
Succeded {
result: TaskResult,
#[serde(with = "time::serde::rfc3339")]
timestamp: OffsetDateTime,
},
Failed {
error: ResponseError,
#[serde(with = "time::serde::rfc3339")]
timestamp: OffsetDateTime,
},
}
impl From<TaskEvent> for NewTaskEvent {
fn from(other: TaskEvent) -> Self {
match other {
TaskEvent::Created(x) => NewTaskEvent::Created(x),
TaskEvent::Batched {
timestamp,
batch_id,
} => NewTaskEvent::Batched {
timestamp,
batch_id,
},
TaskEvent::Processing(x) => NewTaskEvent::Processing(x),
TaskEvent::Succeded { result, timestamp } => {
NewTaskEvent::Succeeded { result, timestamp }
}
TaskEvent::Failed { error, timestamp } => NewTaskEvent::Failed { error, timestamp },
}
}
}
#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
#[allow(clippy::large_enum_variant)]
pub enum TaskContent {
DocumentAddition {
content_uuid: Uuid,
merge_strategy: IndexDocumentsMethod,
primary_key: Option<String>,
documents_count: usize,
allow_index_creation: bool,
},
DocumentDeletion(DocumentDeletion),
SettingsUpdate {
settings: Settings<Unchecked>,
/// Indicates whether the task was a deletion
is_deletion: bool,
allow_index_creation: bool,
},
IndexDeletion,
IndexCreation {
primary_key: Option<String>,
},
IndexUpdate {
primary_key: Option<String>,
},
Dump {
uid: String,
},
}
impl From<(IndexUid, TaskContent)> for NewTaskContent {
fn from((index_uid, content): (IndexUid, TaskContent)) -> Self {
match content {
TaskContent::DocumentAddition {
content_uuid,
merge_strategy,
primary_key,
documents_count,
allow_index_creation,
} => NewTaskContent::DocumentAddition {
index_uid,
content_uuid,
merge_strategy,
primary_key,
documents_count,
allow_index_creation,
},
TaskContent::DocumentDeletion(deletion) => NewTaskContent::DocumentDeletion {
index_uid,
deletion,
},
TaskContent::SettingsUpdate {
settings,
is_deletion,
allow_index_creation,
} => NewTaskContent::SettingsUpdate {
index_uid,
settings,
is_deletion,
allow_index_creation,
},
TaskContent::IndexDeletion => NewTaskContent::IndexDeletion { index_uid },
TaskContent::IndexCreation { primary_key } => NewTaskContent::IndexCreation {
index_uid,
primary_key,
},
TaskContent::IndexUpdate { primary_key } => NewTaskContent::IndexUpdate {
index_uid,
primary_key,
},
TaskContent::Dump { uid } => NewTaskContent::Dump { uid },
}
}
}

View File

@@ -1,42 +0,0 @@
use meilisearch_auth::error::AuthControllerError;
use meilisearch_types::error::{Code, ErrorCode};
use meilisearch_types::internal_error;
use crate::{index_resolver::error::IndexResolverError, tasks::error::TaskError};
pub type Result<T> = std::result::Result<T, DumpError>;
#[derive(thiserror::Error, Debug)]
pub enum DumpError {
#[error("An internal error has occurred. `{0}`.")]
Internal(Box<dyn std::error::Error + Send + Sync + 'static>),
#[error("{0}")]
IndexResolver(Box<IndexResolverError>),
}
internal_error!(
DumpError: milli::heed::Error,
std::io::Error,
tokio::task::JoinError,
tokio::sync::oneshot::error::RecvError,
serde_json::error::Error,
tempfile::PersistError,
fs_extra::error::Error,
AuthControllerError,
TaskError
);
impl From<IndexResolverError> for DumpError {
fn from(e: IndexResolverError) -> Self {
Self::IndexResolver(Box::new(e))
}
}
impl ErrorCode for DumpError {
fn error_code(&self) -> Code {
match self {
DumpError::Internal(_) => Code::Internal,
DumpError::IndexResolver(e) => e.error_code(),
}
}
}

View File

@@ -1,188 +0,0 @@
#[cfg(not(test))]
pub use real::DumpHandler;
#[cfg(test)]
pub use test::MockDumpHandler as DumpHandler;
use time::{macros::format_description, OffsetDateTime};
/// Generate uid from creation date
pub fn generate_uid() -> String {
OffsetDateTime::now_utc()
.format(format_description!(
"[year repr:full][month repr:numerical][day padding:zero]-[hour padding:zero][minute padding:zero][second padding:zero][subsecond digits:3]"
))
.unwrap()
}
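// Illustrative only: the uid is derived from the wall clock at call time.
// A dump started on 2022-11-28 at 13:56:35.123 UTC would get the uid
// "20221128-135635123" (yyyymmdd-hhmmss plus three subsecond digits).
fn generate_uid_example() {
    let uid = generate_uid();
    assert_eq!(uid.len(), "20221128-135635123".len());
}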
mod real {
use std::path::PathBuf;
use std::sync::Arc;
use log::{info, trace};
use meilisearch_auth::AuthController;
use milli::heed::Env;
use tokio::fs::create_dir_all;
use tokio::io::AsyncWriteExt;
use crate::analytics;
use crate::compression::to_tar_gz;
use crate::dump::error::{DumpError, Result};
use crate::dump::{MetadataVersion, META_FILE_NAME};
use crate::index_resolver::{
index_store::IndexStore, meta_store::IndexMetaStore, IndexResolver,
};
use crate::tasks::TaskStore;
use crate::update_file_store::UpdateFileStore;
pub struct DumpHandler<U, I> {
dump_path: PathBuf,
db_path: PathBuf,
update_file_store: UpdateFileStore,
task_store_size: usize,
index_db_size: usize,
env: Arc<Env>,
index_resolver: Arc<IndexResolver<U, I>>,
}
impl<U, I> DumpHandler<U, I>
where
U: IndexMetaStore + Sync + Send + 'static,
I: IndexStore + Sync + Send + 'static,
{
pub fn new(
dump_path: PathBuf,
db_path: PathBuf,
update_file_store: UpdateFileStore,
task_store_size: usize,
index_db_size: usize,
env: Arc<Env>,
index_resolver: Arc<IndexResolver<U, I>>,
) -> Self {
Self {
dump_path,
db_path,
update_file_store,
task_store_size,
index_db_size,
env,
index_resolver,
}
}
pub async fn run(&self, uid: String) -> Result<()> {
trace!("Performing dump.");
create_dir_all(&self.dump_path).await?;
let temp_dump_dir = tokio::task::spawn_blocking(tempfile::TempDir::new).await??;
let temp_dump_path = temp_dump_dir.path().to_owned();
let meta = MetadataVersion::new_v5(self.index_db_size, self.task_store_size);
let meta_path = temp_dump_path.join(META_FILE_NAME);
let meta_bytes = serde_json::to_vec(&meta)?;
let mut meta_file = tokio::fs::File::create(&meta_path).await?;
meta_file.write_all(&meta_bytes).await?;
analytics::copy_user_id(&self.db_path, &temp_dump_path);
create_dir_all(&temp_dump_path.join("indexes")).await?;
let db_path = self.db_path.clone();
let temp_dump_path_clone = temp_dump_path.clone();
tokio::task::spawn_blocking(move || -> Result<()> {
AuthController::dump(db_path, temp_dump_path_clone)?;
Ok(())
})
.await??;
TaskStore::dump(
self.env.clone(),
&temp_dump_path,
self.update_file_store.clone(),
)
.await?;
self.index_resolver.dump(&temp_dump_path).await?;
let dump_path = self.dump_path.clone();
let dump_path = tokio::task::spawn_blocking(move || -> Result<PathBuf> {
// for now we simply copy the updates/updates_files
// FIXME: We may copy more files than necessary, if new files are added while we are
// performing the dump. We need a way to filter them out.
let temp_dump_file = tempfile::NamedTempFile::new_in(&dump_path)?;
to_tar_gz(temp_dump_path, temp_dump_file.path())
.map_err(|e| DumpError::Internal(e.into()))?;
let dump_path = dump_path.join(uid).with_extension("dump");
temp_dump_file.persist(&dump_path)?;
Ok(dump_path)
})
.await??;
info!("Created dump in {:?}.", dump_path);
Ok(())
}
}
}
#[cfg(test)]
mod test {
use std::path::PathBuf;
use std::sync::Arc;
use milli::heed::Env;
use nelson::Mocker;
use crate::dump::error::Result;
use crate::index_resolver::IndexResolver;
use crate::index_resolver::{index_store::IndexStore, meta_store::IndexMetaStore};
use crate::update_file_store::UpdateFileStore;
use super::*;
pub enum MockDumpHandler<U, I> {
Real(super::real::DumpHandler<U, I>),
Mock(Mocker),
}
impl<U, I> MockDumpHandler<U, I> {
pub fn mock(mocker: Mocker) -> Self {
Self::Mock(mocker)
}
}
impl<U, I> MockDumpHandler<U, I>
where
U: IndexMetaStore + Sync + Send + 'static,
I: IndexStore + Sync + Send + 'static,
{
pub fn new(
dump_path: PathBuf,
db_path: PathBuf,
update_file_store: UpdateFileStore,
task_store_size: usize,
index_db_size: usize,
env: Arc<Env>,
index_resolver: Arc<IndexResolver<U, I>>,
) -> Self {
Self::Real(super::real::DumpHandler::new(
dump_path,
db_path,
update_file_store,
task_store_size,
index_db_size,
env,
index_resolver,
))
}
pub async fn run(&self, uid: String) -> Result<()> {
match self {
DumpHandler::Real(real) => real.run(uid).await,
DumpHandler::Mock(mocker) => unsafe { mocker.get("run").call(uid) },
}
}
}
}

View File

@@ -1,4 +0,0 @@
pub mod v2;
pub mod v3;
pub mod v4;
pub mod v5;

View File

@@ -1,24 +0,0 @@
use std::path::Path;
use serde::{Deserialize, Serialize};
use crate::index_controller::IndexMetadata;
use crate::options::IndexerOpts;
#[derive(Serialize, Deserialize, Debug)]
#[serde(rename_all = "camelCase")]
pub struct MetadataV1 {
pub db_version: String,
indexes: Vec<IndexMetadata>,
}
impl MetadataV1 {
#[allow(dead_code, unreachable_code, unused_variables)]
pub fn load_dump(
self,
src: impl AsRef<Path>,
dst: impl AsRef<Path>,
size: usize,
indexer_options: &IndexerOpts,
) -> anyhow::Result<()> {
anyhow::bail!("The version 1 of the dumps is not supported anymore. You can re-export your dump from a version between 0.21 and 0.24, or start fresh from a version 0.25 onwards.")
}
}

View File

@@ -1,216 +0,0 @@
use std::fs::{File, OpenOptions};
use std::io::Write;
use std::path::{Path, PathBuf};
use serde_json::{Deserializer, Value};
use tempfile::NamedTempFile;
use crate::dump::compat::{self, v2, v3};
use crate::dump::Metadata;
use crate::options::IndexerOpts;
/// The dump v2 reads the dump folder and patches all the needed files to make it compatible with a
/// dump v3, then calls the dump v3 to actually handle the dump.
pub fn load_dump(
meta: Metadata,
src: impl AsRef<Path>,
dst: impl AsRef<Path>,
index_db_size: usize,
update_db_size: usize,
indexing_options: &IndexerOpts,
) -> anyhow::Result<()> {
log::info!("Patching dump V2 to dump V3...");
let indexes_path = src.as_ref().join("indexes");
let dir_entries = std::fs::read_dir(indexes_path)?;
for entry in dir_entries {
let entry = entry?;
// rename the index folder
let path = entry.path();
let new_path = patch_index_uuid_path(&path).expect("invalid index folder.");
std::fs::rename(path, &new_path)?;
let settings_path = new_path.join("meta.json");
patch_settings(settings_path)?;
}
let update_dir = src.as_ref().join("updates");
let update_path = update_dir.join("data.jsonl");
patch_updates(update_dir, update_path)?;
super::v3::load_dump(
meta,
src,
dst,
index_db_size,
update_db_size,
indexing_options,
)
}
fn patch_index_uuid_path(path: &Path) -> Option<PathBuf> {
let uuid = path.file_name()?.to_str()?.trim_start_matches("index-");
let new_path = path.parent()?.join(uuid);
Some(new_path)
}
fn patch_settings(path: impl AsRef<Path>) -> anyhow::Result<()> {
let mut meta_file = File::open(&path)?;
let mut meta: Value = serde_json::from_reader(&mut meta_file)?;
// We first deserialize the dump meta into a serde_json::Value and change
// the custom ranking rules settings from the old format to the new format.
if let Some(ranking_rules) = meta.pointer_mut("/settings/rankingRules") {
patch_custom_ranking_rules(ranking_rules);
}
let mut meta_file = OpenOptions::new().truncate(true).write(true).open(path)?;
serde_json::to_writer(&mut meta_file, &meta)?;
Ok(())
}
fn patch_updates(dir: impl AsRef<Path>, path: impl AsRef<Path>) -> anyhow::Result<()> {
let mut output_update_file = NamedTempFile::new_in(&dir)?;
let update_file = File::open(&path)?;
let stream = Deserializer::from_reader(update_file).into_iter::<v2::UpdateEntry>();
for update in stream {
let update_entry = update?;
let update_entry = v3::UpdateEntry::from(update_entry);
serde_json::to_writer(&mut output_update_file, &update_entry)?;
output_update_file.write_all(b"\n")?;
}
output_update_file.flush()?;
output_update_file.persist(path)?;
Ok(())
}
/// Converts the ranking rules from the format `asc(_)`, `desc(_)` to the format `_:asc`, `_:desc`.
///
/// This is done for compatibility reasons, and to avoid a new dump version,
/// since the new syntax was introduced soon after the new dump version.
fn patch_custom_ranking_rules(ranking_rules: &mut Value) {
*ranking_rules = match ranking_rules.take() {
Value::Array(values) => values
.into_iter()
.filter_map(|value| match value {
Value::String(s) if s.starts_with("asc") => compat::asc_ranking_rule(&s)
.map(|f| format!("{}:asc", f))
.map(Value::String),
Value::String(s) if s.starts_with("desc") => compat::desc_ranking_rule(&s)
.map(|f| format!("{}:desc", f))
.map(Value::String),
otherwise => Some(otherwise),
})
.collect(),
otherwise => otherwise,
}
}
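// A small sketch of the patching above on a made-up settings value, assuming
// `serde_json::json!` for readability.
fn patch_custom_ranking_rules_example() {
    use serde_json::json;
    let mut rules = json!(["words", "asc(price)", "desc(release_date)"]);
    patch_custom_ranking_rules(&mut rules);
    // Old-style rules are rewritten, built-in rules are kept as-is.
    assert_eq!(rules, json!(["words", "price:asc", "release_date:desc"]));
}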
impl From<v2::UpdateEntry> for v3::UpdateEntry {
fn from(v2::UpdateEntry { uuid, update }: v2::UpdateEntry) -> Self {
let update = match update {
v2::UpdateStatus::Processing(meta) => v3::UpdateStatus::Processing(meta.into()),
v2::UpdateStatus::Enqueued(meta) => v3::UpdateStatus::Enqueued(meta.into()),
v2::UpdateStatus::Processed(meta) => v3::UpdateStatus::Processed(meta.into()),
v2::UpdateStatus::Aborted(_) => unreachable!("Updates could never be aborted."),
v2::UpdateStatus::Failed(meta) => v3::UpdateStatus::Failed(meta.into()),
};
Self { uuid, update }
}
}
impl From<v2::Failed> for v3::Failed {
fn from(other: v2::Failed) -> Self {
let v2::Failed {
from,
error,
failed_at,
} = other;
Self {
from: from.into(),
msg: error.message,
code: v2::error_code_from_str(&error.error_code)
.expect("Invalid update: Invalid error code"),
failed_at,
}
}
}
impl From<v2::Processing> for v3::Processing {
fn from(other: v2::Processing) -> Self {
let v2::Processing {
from,
started_processing_at,
} = other;
Self {
from: from.into(),
started_processing_at,
}
}
}
impl From<v2::Enqueued> for v3::Enqueued {
fn from(other: v2::Enqueued) -> Self {
let v2::Enqueued {
update_id,
meta,
enqueued_at,
content,
} = other;
let meta = match meta {
v2::UpdateMeta::DocumentsAddition {
method,
primary_key,
..
} => {
v3::Update::DocumentAddition {
primary_key,
method,
// Just ignore if the uuid is not present. If it is needed later, an error will
// be thrown.
content_uuid: content.unwrap_or_default(),
}
}
v2::UpdateMeta::ClearDocuments => v3::Update::ClearDocuments,
v2::UpdateMeta::DeleteDocuments { ids } => v3::Update::DeleteDocuments(ids),
v2::UpdateMeta::Settings(settings) => v3::Update::Settings(settings),
};
Self {
update_id,
meta,
enqueued_at,
}
}
}
impl From<v2::Processed> for v3::Processed {
fn from(other: v2::Processed) -> Self {
let v2::Processed {
from,
success,
processed_at,
} = other;
Self {
success,
processed_at,
from: from.into(),
}
}
}

View File

@@ -1,136 +0,0 @@
use std::collections::HashMap;
use std::fs::{self, File};
use std::io::{BufReader, BufWriter, Write};
use std::path::Path;
use anyhow::Context;
use fs_extra::dir::{self, CopyOptions};
use log::info;
use tempfile::tempdir;
use uuid::Uuid;
use crate::dump::compat::{self, v3};
use crate::dump::Metadata;
use crate::index_resolver::meta_store::{DumpEntry, IndexMeta};
use crate::options::IndexerOpts;
use crate::tasks::task::TaskId;
/// dump structure for V3:
/// .
/// ├── indexes
/// │   └── 25f10bb8-6ea8-42f0-bd48-ad5857f77648
/// │   ├── documents.jsonl
/// │   └── meta.json
/// ├── index_uuids
/// │   └── data.jsonl
/// ├── metadata.json
/// └── updates
/// └── data.jsonl
pub fn load_dump(
meta: Metadata,
src: impl AsRef<Path>,
dst: impl AsRef<Path>,
index_db_size: usize,
meta_env_size: usize,
indexing_options: &IndexerOpts,
) -> anyhow::Result<()> {
info!("Patching dump V3 to dump V4...");
let patched_dir = tempdir()?;
let options = CopyOptions::default();
dir::copy(src.as_ref().join("indexes"), patched_dir.path(), &options)?;
dir::copy(
src.as_ref().join("index_uuids"),
patched_dir.path(),
&options,
)?;
let uuid_map = patch_index_meta(
src.as_ref().join("index_uuids/data.jsonl"),
patched_dir.path(),
)?;
fs::copy(
src.as_ref().join("metadata.json"),
patched_dir.path().join("metadata.json"),
)?;
patch_updates(&src, patched_dir.path(), uuid_map)?;
super::v4::load_dump(
meta,
patched_dir.path(),
dst,
index_db_size,
meta_env_size,
indexing_options,
)
}
fn patch_index_meta(
path: impl AsRef<Path>,
dst: impl AsRef<Path>,
) -> anyhow::Result<HashMap<Uuid, String>> {
let file = BufReader::new(File::open(path)?);
let dst = dst.as_ref().join("index_uuids");
fs::create_dir_all(&dst)?;
let mut dst_file = File::create(dst.join("data.jsonl"))?;
let map = serde_json::Deserializer::from_reader(file)
.into_iter::<v3::DumpEntry>()
.try_fold(HashMap::new(), |mut map, entry| -> anyhow::Result<_> {
let entry = entry?;
map.insert(entry.uuid, entry.uid.clone());
let meta = IndexMeta {
uuid: entry.uuid,
// This is lost information, we patch it to 0;
creation_task_id: 0,
};
let entry = DumpEntry {
uid: entry.uid,
index_meta: meta,
};
serde_json::to_writer(&mut dst_file, &entry)?;
dst_file.write_all(b"\n")?;
Ok(map)
})?;
dst_file.flush()?;
Ok(map)
}
fn patch_updates(
src: impl AsRef<Path>,
dst: impl AsRef<Path>,
uuid_map: HashMap<Uuid, String>,
) -> anyhow::Result<()> {
let dst = dst.as_ref().join("updates");
fs::create_dir_all(&dst)?;
let mut dst_file = BufWriter::new(File::create(dst.join("data.jsonl"))?);
let src_file = BufReader::new(File::open(src.as_ref().join("updates/data.jsonl"))?);
serde_json::Deserializer::from_reader(src_file)
.into_iter::<v3::UpdateEntry>()
.enumerate()
.try_for_each(|(task_id, entry)| -> anyhow::Result<()> {
let entry = entry?;
let name = uuid_map
.get(&entry.uuid)
.with_context(|| format!("Unknown index uuid: {}", entry.uuid))?
.clone();
serde_json::to_writer(
&mut dst_file,
&compat::v4::Task::from((entry.update, name, task_id as TaskId)),
)?;
dst_file.write_all(b"\n")?;
Ok(())
})?;
dst_file.flush()?;
Ok(())
}

View File

@@ -1,103 +0,0 @@
use std::fs::{self, create_dir_all, File};
use std::io::{BufReader, Write};
use std::path::Path;
use fs_extra::dir::{self, CopyOptions};
use log::info;
use serde_json::{Deserializer, Map, Value};
use tempfile::tempdir;
use uuid::Uuid;
use crate::dump::{compat, Metadata};
use crate::options::IndexerOpts;
use crate::tasks::task::Task;
pub fn load_dump(
meta: Metadata,
src: impl AsRef<Path>,
dst: impl AsRef<Path>,
index_db_size: usize,
meta_env_size: usize,
indexing_options: &IndexerOpts,
) -> anyhow::Result<()> {
info!("Patching dump V4 to dump V5...");
let patched_dir = tempdir()?;
let options = CopyOptions::default();
// Indexes
dir::copy(src.as_ref().join("indexes"), &patched_dir, &options)?;
// Index uuids
dir::copy(src.as_ref().join("index_uuids"), &patched_dir, &options)?;
// Metadata
fs::copy(
src.as_ref().join("metadata.json"),
patched_dir.path().join("metadata.json"),
)?;
// Updates
patch_updates(&src, &patched_dir)?;
// Keys
patch_keys(&src, &patched_dir)?;
super::v5::load_dump(
meta,
&patched_dir,
dst,
index_db_size,
meta_env_size,
indexing_options,
)
}
fn patch_updates(src: impl AsRef<Path>, dst: impl AsRef<Path>) -> anyhow::Result<()> {
let updates_path = src.as_ref().join("updates/data.jsonl");
let output_updates_path = dst.as_ref().join("updates/data.jsonl");
create_dir_all(output_updates_path.parent().unwrap())?;
let updates_file = File::open(updates_path)?;
let mut output_update_file = File::create(output_updates_path)?;
serde_json::Deserializer::from_reader(updates_file)
.into_iter::<compat::v4::Task>()
.try_for_each(|task| -> anyhow::Result<()> {
let task: Task = task?.into();
serde_json::to_writer(&mut output_update_file, &task)?;
output_update_file.write_all(b"\n")?;
Ok(())
})?;
output_update_file.flush()?;
Ok(())
}
fn patch_keys(src: impl AsRef<Path>, dst: impl AsRef<Path>) -> anyhow::Result<()> {
let keys_file_src = src.as_ref().join("keys");
if !keys_file_src.exists() {
return Ok(());
}
fs::create_dir_all(&dst)?;
let keys_file_dst = dst.as_ref().join("keys");
let mut writer = File::create(&keys_file_dst)?;
let reader = BufReader::new(File::open(&keys_file_src)?);
for key in Deserializer::from_reader(reader).into_iter() {
let mut key: Map<String, Value> = key?;
// generate a new uuid v4 and insert it in the key.
let uid = serde_json::to_value(Uuid::new_v4()).unwrap();
key.insert("uid".to_string(), uid);
serde_json::to_writer(&mut writer, &key)?;
writer.write_all(b"\n")?;
}
Ok(())
}

View File

@@ -1,47 +0,0 @@
use std::{path::Path, sync::Arc};
use log::info;
use meilisearch_auth::AuthController;
use milli::heed::EnvOpenOptions;
use crate::analytics;
use crate::dump::Metadata;
use crate::index_resolver::IndexResolver;
use crate::options::IndexerOpts;
use crate::tasks::TaskStore;
use crate::update_file_store::UpdateFileStore;
pub fn load_dump(
meta: Metadata,
src: impl AsRef<Path>,
dst: impl AsRef<Path>,
index_db_size: usize,
meta_env_size: usize,
indexing_options: &IndexerOpts,
) -> anyhow::Result<()> {
info!(
"Loading dump from {}, dump database version: {}, dump version: V5",
meta.dump_date, meta.db_version
);
let mut options = EnvOpenOptions::new();
options.map_size(meta_env_size);
options.max_dbs(100);
let env = Arc::new(options.open(&dst)?);
IndexResolver::load_dump(
src.as_ref(),
&dst,
index_db_size,
env.clone(),
indexing_options,
)?;
UpdateFileStore::load_dump(src.as_ref(), &dst)?;
TaskStore::load_dump(&src, env)?;
AuthController::load_dump(&src, &dst)?;
analytics::copy_user_id(src.as_ref(), dst.as_ref());
info!("Loading indexes.");
Ok(())
}

View File

@@ -1,262 +0,0 @@
use std::fs::File;
use std::path::Path;
use anyhow::bail;
use log::info;
use serde::{Deserialize, Serialize};
use time::OffsetDateTime;
use tempfile::TempDir;
use crate::compression::from_tar_gz;
use crate::options::IndexerOpts;
use self::loaders::{v2, v3, v4, v5};
pub use handler::{generate_uid, DumpHandler};
mod compat;
pub mod error;
mod handler;
mod loaders;
const META_FILE_NAME: &str = "metadata.json";
#[derive(Serialize, Deserialize, Debug)]
#[serde(rename_all = "camelCase")]
pub struct Metadata {
db_version: String,
index_db_size: usize,
update_db_size: usize,
#[serde(with = "time::serde::rfc3339")]
dump_date: OffsetDateTime,
}
impl Metadata {
pub fn new(index_db_size: usize, update_db_size: usize) -> Self {
Self {
db_version: env!("CARGO_PKG_VERSION").to_string(),
index_db_size,
update_db_size,
dump_date: OffsetDateTime::now_utc(),
}
}
}
#[derive(Serialize, Deserialize, Debug)]
#[serde(rename_all = "camelCase")]
pub struct MetadataV1 {
pub db_version: String,
}
#[derive(Debug, Serialize, Deserialize)]
#[serde(tag = "dumpVersion")]
pub enum MetadataVersion {
V1(MetadataV1),
V2(Metadata),
V3(Metadata),
V4(Metadata),
// V5 is forward compatible with V4 but not backward compatible.
V5(Metadata),
}
impl MetadataVersion {
pub fn load_dump(
self,
src: impl AsRef<Path>,
dst: impl AsRef<Path>,
index_db_size: usize,
meta_env_size: usize,
indexing_options: &IndexerOpts,
) -> anyhow::Result<()> {
match self {
MetadataVersion::V1(_meta) => {
anyhow::bail!("The version 1 of the dumps is not supported anymore. You can re-export your dump from a version between 0.21 and 0.24, or start fresh from a version 0.25 onwards.")
}
MetadataVersion::V2(meta) => v2::load_dump(
meta,
src,
dst,
index_db_size,
meta_env_size,
indexing_options,
)?,
MetadataVersion::V3(meta) => v3::load_dump(
meta,
src,
dst,
index_db_size,
meta_env_size,
indexing_options,
)?,
MetadataVersion::V4(meta) => v4::load_dump(
meta,
src,
dst,
index_db_size,
meta_env_size,
indexing_options,
)?,
MetadataVersion::V5(meta) => v5::load_dump(
meta,
src,
dst,
index_db_size,
meta_env_size,
indexing_options,
)?,
}
Ok(())
}
pub fn new_v5(index_db_size: usize, update_db_size: usize) -> Self {
let meta = Metadata::new(index_db_size, update_db_size);
Self::V5(meta)
}
pub fn db_version(&self) -> &str {
match self {
Self::V1(meta) => &meta.db_version,
Self::V2(meta) | Self::V3(meta) | Self::V4(meta) | Self::V5(meta) => &meta.db_version,
}
}
pub fn version(&self) -> &'static str {
match self {
MetadataVersion::V1(_) => "V1",
MetadataVersion::V2(_) => "V2",
MetadataVersion::V3(_) => "V3",
MetadataVersion::V4(_) => "V4",
MetadataVersion::V5(_) => "V5",
}
}
pub fn dump_date(&self) -> Option<&OffsetDateTime> {
match self {
MetadataVersion::V1(_) => None,
MetadataVersion::V2(meta)
| MetadataVersion::V3(meta)
| MetadataVersion::V4(meta)
| MetadataVersion::V5(meta) => Some(&meta.dump_date),
}
}
}
#[derive(Debug, Serialize, Deserialize, PartialEq, Eq, Clone)]
#[serde(rename_all = "snake_case")]
pub enum DumpStatus {
Done,
InProgress,
Failed,
}
pub fn load_dump(
dst_path: impl AsRef<Path>,
src_path: impl AsRef<Path>,
ignore_dump_if_db_exists: bool,
ignore_missing_dump: bool,
index_db_size: usize,
update_db_size: usize,
indexer_opts: &IndexerOpts,
) -> anyhow::Result<()> {
let empty_db = crate::is_empty_db(&dst_path);
let src_path_exists = src_path.as_ref().exists();
if empty_db && src_path_exists {
let (tmp_src, tmp_dst, meta) = extract_dump(&dst_path, &src_path)?;
meta.load_dump(
tmp_src.path(),
tmp_dst.path(),
index_db_size,
update_db_size,
indexer_opts,
)?;
persist_dump(&dst_path, tmp_dst)?;
Ok(())
} else if !empty_db && !ignore_dump_if_db_exists {
bail!(
"database already exists at {:?}, try to delete it or rename it",
dst_path
.as_ref()
.canonicalize()
.unwrap_or_else(|_| dst_path.as_ref().to_owned())
)
} else if !src_path_exists && !ignore_missing_dump {
bail!("dump doesn't exist at {:?}", src_path.as_ref())
} else {
// there is nothing to do
Ok(())
}
}
fn extract_dump(
dst_path: impl AsRef<Path>,
src_path: impl AsRef<Path>,
) -> anyhow::Result<(TempDir, TempDir, MetadataVersion)> {
// Set up a temp directory path in the same path as the database, to prevent cross-device
// references.
let temp_path = dst_path
.as_ref()
.parent()
.map(ToOwned::to_owned)
.unwrap_or_else(|| ".".into());
let tmp_src = tempfile::tempdir_in(temp_path)?;
let tmp_src_path = tmp_src.path();
from_tar_gz(&src_path, tmp_src_path)?;
let meta_path = tmp_src_path.join(META_FILE_NAME);
let mut meta_file = File::open(&meta_path)?;
let meta: MetadataVersion = serde_json::from_reader(&mut meta_file)?;
if !dst_path.as_ref().exists() {
std::fs::create_dir_all(dst_path.as_ref())?;
}
let tmp_dst = tempfile::tempdir_in(dst_path.as_ref())?;
info!(
"Loading dump {}, dump database version: {}, dump version: {}",
meta.dump_date()
.map(|t| format!("from {}", t))
.unwrap_or_else(String::new),
meta.db_version(),
meta.version()
);
Ok((tmp_src, tmp_dst, meta))
}
fn persist_dump(dst_path: impl AsRef<Path>, tmp_dst: TempDir) -> anyhow::Result<()> {
let persisted_dump = tmp_dst.into_path();
// Delete everything in the `data.ms` except the tempdir.
if dst_path.as_ref().exists() {
for file in dst_path.as_ref().read_dir().unwrap() {
let file = file.unwrap().path();
if file.file_name() == persisted_dump.file_name() {
continue;
}
if file.is_file() {
std::fs::remove_file(&file)?;
} else {
std::fs::remove_dir_all(&file)?;
}
}
}
// Move the whole content of the tempdir into the `data.ms`.
for file in persisted_dump.read_dir().unwrap() {
let file = file.unwrap().path();
std::fs::rename(&file, &dst_path.as_ref().join(file.file_name().unwrap()))?;
}
// Delete the empty tempdir.
std::fs::remove_dir_all(&persisted_dump)?;
Ok(())
}

View File

@@ -1,55 +0,0 @@
use std::error::Error;
use std::fmt;
use meilisearch_types::error::{Code, ErrorCode};
use milli::UserError;
#[derive(Debug)]
pub struct MilliError<'a>(pub &'a milli::Error);
impl Error for MilliError<'_> {}
impl fmt::Display for MilliError<'_> {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
self.0.fmt(f)
}
}
impl ErrorCode for MilliError<'_> {
fn error_code(&self) -> Code {
match self.0 {
milli::Error::InternalError(_) => Code::Internal,
milli::Error::IoError(_) => Code::Internal,
milli::Error::UserError(ref error) => {
match error {
// TODO: wait for spec for new error codes.
UserError::SerdeJson(_)
| UserError::InvalidLmdbOpenOptions
| UserError::DocumentLimitReached
| UserError::AccessingSoftDeletedDocument { .. }
| UserError::UnknownInternalDocumentId { .. } => Code::Internal,
UserError::InvalidStoreFile => Code::InvalidStore,
UserError::NoSpaceLeftOnDevice => Code::NoSpaceLeftOnDevice,
UserError::MaxDatabaseSizeReached => Code::DatabaseSizeLimitReached,
UserError::AttributeLimitReached => Code::MaxFieldsLimitExceeded,
UserError::InvalidFilter(_) => Code::Filter,
UserError::MissingDocumentId { .. } => Code::MissingDocumentId,
UserError::InvalidDocumentId { .. } | UserError::TooManyDocumentIds { .. } => {
Code::InvalidDocumentId
}
UserError::MissingPrimaryKey => Code::MissingPrimaryKey,
UserError::PrimaryKeyCannotBeChanged(_) => Code::PrimaryKeyAlreadyPresent,
UserError::SortRankingRuleMissing => Code::Sort,
UserError::InvalidFacetsDistribution { .. } => Code::BadRequest,
UserError::InvalidSortableAttribute { .. } => Code::Sort,
UserError::CriterionError(_) => Code::InvalidRankingRule,
UserError::InvalidGeoField { .. } => Code::InvalidGeoField,
UserError::SortError(_) => Code::Sort,
UserError::InvalidMinTypoWordLenSetting(_, _) => {
Code::InvalidMinWordLengthForTypo
}
}
}
}
}
}

View File

@@ -1,160 +0,0 @@
use std::fs::{create_dir_all, File};
use std::io::{BufReader, Seek, SeekFrom, Write};
use std::path::Path;
use anyhow::Context;
use indexmap::IndexMap;
use milli::documents::DocumentsBatchReader;
use milli::heed::{EnvOpenOptions, RoTxn};
use milli::update::{IndexDocumentsConfig, IndexerConfig};
use serde::{Deserialize, Serialize};
use crate::document_formats::read_ndjson;
use crate::index::updates::apply_settings_to_builder;
use super::error::Result;
use super::{index::Index, Settings, Unchecked};
#[derive(Serialize, Deserialize)]
struct DumpMeta {
settings: Settings<Unchecked>,
primary_key: Option<String>,
}
const META_FILE_NAME: &str = "meta.json";
const DATA_FILE_NAME: &str = "documents.jsonl";
impl Index {
pub fn dump(&self, path: impl AsRef<Path>) -> Result<()> {
// Acquire a write txn to make sure any ongoing write is finished before we start.
let txn = self.write_txn()?;
let path = path.as_ref().join(format!("indexes/{}", self.uuid));
create_dir_all(&path)?;
self.dump_documents(&txn, &path)?;
self.dump_meta(&txn, &path)?;
Ok(())
}
fn dump_documents(&self, txn: &RoTxn, path: impl AsRef<Path>) -> Result<()> {
let document_file_path = path.as_ref().join(DATA_FILE_NAME);
let mut document_file = File::create(&document_file_path)?;
let documents = self.all_documents(txn)?;
let fields_ids_map = self.fields_ids_map(txn)?;
// dump documents
let mut json_map = IndexMap::new();
for document in documents {
let (_, reader) = document?;
for (fid, bytes) in reader.iter() {
if let Some(name) = fields_ids_map.name(fid) {
json_map.insert(name, serde_json::from_slice::<serde_json::Value>(bytes)?);
}
}
serde_json::to_writer(&mut document_file, &json_map)?;
document_file.write_all(b"\n")?;
json_map.clear();
}
Ok(())
}
fn dump_meta(&self, txn: &RoTxn, path: impl AsRef<Path>) -> Result<()> {
let meta_file_path = path.as_ref().join(META_FILE_NAME);
let mut meta_file = File::create(&meta_file_path)?;
let settings = self.settings_txn(txn)?.into_unchecked();
let primary_key = self.primary_key(txn)?.map(String::from);
let meta = DumpMeta {
settings,
primary_key,
};
serde_json::to_writer(&mut meta_file, &meta)?;
Ok(())
}
pub fn load_dump(
src: impl AsRef<Path>,
dst: impl AsRef<Path>,
size: usize,
indexer_config: &IndexerConfig,
) -> anyhow::Result<()> {
let dir_name = src
.as_ref()
.file_name()
.with_context(|| format!("invalid dump index: {}", src.as_ref().display()))?;
let dst_dir_path = dst.as_ref().join("indexes").join(dir_name);
create_dir_all(&dst_dir_path)?;
let meta_path = src.as_ref().join(META_FILE_NAME);
let meta_file = File::open(meta_path)?;
let DumpMeta {
settings,
primary_key,
} = serde_json::from_reader(meta_file)?;
let settings = settings.check();
let mut options = EnvOpenOptions::new();
options.map_size(size);
let index = milli::Index::new(options, &dst_dir_path)?;
let mut txn = index.write_txn()?;
// Apply settings first
let mut builder = milli::update::Settings::new(&mut txn, &index, indexer_config);
if let Some(primary_key) = primary_key {
builder.set_primary_key(primary_key);
}
apply_settings_to_builder(&settings, &mut builder);
builder.execute(|_| ())?;
let document_file_path = src.as_ref().join(DATA_FILE_NAME);
let reader = BufReader::new(File::open(&document_file_path)?);
let mut tmp_doc_file = tempfile::tempfile()?;
let empty = match read_ndjson(reader, &mut tmp_doc_file) {
// if there was no document in the file it's because the index was empty
Ok(0) => true,
Ok(_) => false,
Err(e) => return Err(e.into()),
};
if !empty {
tmp_doc_file.seek(SeekFrom::Start(0))?;
let documents_reader = DocumentsBatchReader::from_reader(tmp_doc_file)?;
// If the document file is empty, we don't perform the document addition, to prevent
// a primary key error from being thrown.
let config = IndexDocumentsConfig::default();
let builder = milli::update::IndexDocuments::new(
&mut txn,
&index,
indexer_config,
config,
|_| (),
)?;
let (builder, user_error) = builder.add_documents(documents_reader)?;
user_error?;
builder.execute()?;
}
txn.commit()?;
index.prepare_for_closing().wait();
Ok(())
}
}

View File

@@ -1,61 +0,0 @@
use std::error::Error;
use meilisearch_types::error::{Code, ErrorCode};
use meilisearch_types::internal_error;
use serde_json::Value;
use crate::{error::MilliError, update_file_store};
pub type Result<T> = std::result::Result<T, IndexError>;
#[derive(Debug, thiserror::Error)]
pub enum IndexError {
#[error("An internal error has occurred. `{0}`.")]
Internal(Box<dyn Error + Send + Sync + 'static>),
#[error("Document `{0}` not found.")]
DocumentNotFound(String),
#[error("{0}")]
Facet(#[from] FacetError),
#[error("{0}")]
Milli(#[from] milli::Error),
}
internal_error!(
IndexError: std::io::Error,
milli::heed::Error,
fst::Error,
serde_json::Error,
update_file_store::UpdateFileStoreError,
milli::documents::Error
);
impl ErrorCode for IndexError {
fn error_code(&self) -> Code {
match self {
IndexError::Internal(_) => Code::Internal,
IndexError::DocumentNotFound(_) => Code::DocumentNotFound,
IndexError::Facet(e) => e.error_code(),
IndexError::Milli(e) => MilliError(e).error_code(),
}
}
}
impl From<milli::UserError> for IndexError {
fn from(error: milli::UserError) -> IndexError {
IndexError::Milli(error.into())
}
}
#[derive(Debug, thiserror::Error)]
pub enum FacetError {
#[error("Invalid syntax for the filter parameter: `expected {}, found: {1}`.", .0.join(", "))]
InvalidExpression(&'static [&'static str], Value),
}
impl ErrorCode for FacetError {
fn error_code(&self) -> Code {
match self {
FacetError::InvalidExpression(_, _) => Code::Filter,
}
}
}

View File

@@ -1,332 +0,0 @@
use std::collections::BTreeSet;
use std::fs::create_dir_all;
use std::marker::PhantomData;
use std::ops::Deref;
use std::path::Path;
use std::sync::Arc;
use fst::IntoStreamer;
use milli::heed::{CompactionOption, EnvOpenOptions, RoTxn};
use milli::update::{IndexerConfig, Setting};
use milli::{obkv_to_json, FieldDistribution, DEFAULT_VALUES_PER_FACET};
use serde::{Deserialize, Serialize};
use serde_json::{Map, Value};
use time::OffsetDateTime;
use uuid::Uuid;
use walkdir::WalkDir;
use crate::index::search::DEFAULT_PAGINATION_MAX_TOTAL_HITS;
use super::error::IndexError;
use super::error::Result;
use super::updates::{FacetingSettings, MinWordSizeTyposSetting, PaginationSettings, TypoSettings};
use super::{Checked, Settings};
pub type Document = Map<String, Value>;
#[derive(Debug, Serialize, Deserialize, Clone)]
#[serde(rename_all = "camelCase")]
pub struct IndexMeta {
#[serde(with = "time::serde::rfc3339")]
pub created_at: OffsetDateTime,
#[serde(with = "time::serde::rfc3339")]
pub updated_at: OffsetDateTime,
pub primary_key: Option<String>,
}
impl IndexMeta {
pub fn new(index: &Index) -> Result<Self> {
let txn = index.read_txn()?;
Self::new_txn(index, &txn)
}
pub fn new_txn(index: &Index, txn: &milli::heed::RoTxn) -> Result<Self> {
let created_at = index.created_at(txn)?;
let updated_at = index.updated_at(txn)?;
let primary_key = index.primary_key(txn)?.map(String::from);
Ok(Self {
created_at,
updated_at,
primary_key,
})
}
}
#[derive(Serialize, Debug)]
#[serde(rename_all = "camelCase")]
pub struct IndexStats {
#[serde(skip)]
pub size: u64,
pub number_of_documents: u64,
/// Whether the current index is performing an update. It is initially `None` when the
/// index returns it, since it is the `UpdateStore` that knows what index is currently indexing. It is
/// later set to either true or false, when we retrieve the information from the `UpdateStore`.
pub is_indexing: Option<bool>,
pub field_distribution: FieldDistribution,
}
#[derive(Clone, derivative::Derivative)]
#[derivative(Debug)]
pub struct Index {
pub uuid: Uuid,
#[derivative(Debug = "ignore")]
pub inner: Arc<milli::Index>,
#[derivative(Debug = "ignore")]
pub indexer_config: Arc<IndexerConfig>,
}
impl Deref for Index {
type Target = milli::Index;
fn deref(&self) -> &Self::Target {
self.inner.as_ref()
}
}
impl Index {
pub fn open(
path: impl AsRef<Path>,
size: usize,
uuid: Uuid,
update_handler: Arc<IndexerConfig>,
) -> Result<Self> {
log::debug!("opening index in {}", path.as_ref().display());
create_dir_all(&path)?;
let mut options = EnvOpenOptions::new();
options.map_size(size);
let inner = Arc::new(milli::Index::new(options, &path)?);
Ok(Index {
inner,
uuid,
indexer_config: update_handler,
})
}
/// Asynchronously close the underlying index
pub fn close(self) {
self.inner.as_ref().clone().prepare_for_closing();
}
pub fn stats(&self) -> Result<IndexStats> {
let rtxn = self.read_txn()?;
Ok(IndexStats {
size: self.size(),
number_of_documents: self.number_of_documents(&rtxn)?,
is_indexing: None,
field_distribution: self.field_distribution(&rtxn)?,
})
}
pub fn meta(&self) -> Result<IndexMeta> {
IndexMeta::new(self)
}
pub fn settings(&self) -> Result<Settings<Checked>> {
let txn = self.read_txn()?;
self.settings_txn(&txn)
}
pub fn uuid(&self) -> Uuid {
self.uuid
}
pub fn settings_txn(&self, txn: &RoTxn) -> Result<Settings<Checked>> {
let displayed_attributes = self
.displayed_fields(txn)?
.map(|fields| fields.into_iter().map(String::from).collect());
let searchable_attributes = self
.user_defined_searchable_fields(txn)?
.map(|fields| fields.into_iter().map(String::from).collect());
let filterable_attributes = self.filterable_fields(txn)?.into_iter().collect();
let sortable_attributes = self.sortable_fields(txn)?.into_iter().collect();
let criteria = self
.criteria(txn)?
.into_iter()
.map(|c| c.to_string())
.collect();
let stop_words = self
.stop_words(txn)?
.map(|stop_words| -> Result<BTreeSet<_>> {
Ok(stop_words.stream().into_strs()?.into_iter().collect())
})
.transpose()?
.unwrap_or_default();
let distinct_field = self.distinct_field(txn)?.map(String::from);
// In milli, each word in the synonyms map was split on its separator. Since we lost
// this information, we put a space between words.
let synonyms = self
.synonyms(txn)?
.iter()
.map(|(key, values)| {
(
key.join(" "),
values.iter().map(|value| value.join(" ")).collect(),
)
})
.collect();
let min_typo_word_len = MinWordSizeTyposSetting {
one_typo: Setting::Set(self.min_word_len_one_typo(txn)?),
two_typos: Setting::Set(self.min_word_len_two_typos(txn)?),
};
let disabled_words = match self.exact_words(txn)? {
Some(fst) => fst.into_stream().into_strs()?.into_iter().collect(),
None => BTreeSet::new(),
};
let disabled_attributes = self
.exact_attributes(txn)?
.into_iter()
.map(String::from)
.collect();
let typo_tolerance = TypoSettings {
enabled: Setting::Set(self.authorize_typos(txn)?),
min_word_size_for_typos: Setting::Set(min_typo_word_len),
disable_on_words: Setting::Set(disabled_words),
disable_on_attributes: Setting::Set(disabled_attributes),
};
let faceting = FacetingSettings {
max_values_per_facet: Setting::Set(
self.max_values_per_facet(txn)?
.unwrap_or(DEFAULT_VALUES_PER_FACET),
),
};
let pagination = PaginationSettings {
max_total_hits: Setting::Set(
self.pagination_max_total_hits(txn)?
.unwrap_or(DEFAULT_PAGINATION_MAX_TOTAL_HITS),
),
};
Ok(Settings {
displayed_attributes: match displayed_attributes {
Some(attrs) => Setting::Set(attrs),
None => Setting::Reset,
},
searchable_attributes: match searchable_attributes {
Some(attrs) => Setting::Set(attrs),
None => Setting::Reset,
},
filterable_attributes: Setting::Set(filterable_attributes),
sortable_attributes: Setting::Set(sortable_attributes),
ranking_rules: Setting::Set(criteria),
stop_words: Setting::Set(stop_words),
distinct_attribute: match distinct_field {
Some(field) => Setting::Set(field),
None => Setting::Reset,
},
synonyms: Setting::Set(synonyms),
typo_tolerance: Setting::Set(typo_tolerance),
faceting: Setting::Set(faceting),
pagination: Setting::Set(pagination),
_kind: PhantomData,
})
}
/// Returns the total number of documents contained in the index, together with the selected documents.
pub fn retrieve_documents<S: AsRef<str>>(
&self,
offset: usize,
limit: usize,
attributes_to_retrieve: Option<Vec<S>>,
) -> Result<(u64, Vec<Document>)> {
let txn = self.read_txn()?;
let fields_ids_map = self.fields_ids_map(&txn)?;
let all_fields: Vec<_> = fields_ids_map.iter().map(|(id, _)| id).collect();
let mut documents = Vec::new();
for entry in self.all_documents(&txn)?.skip(offset).take(limit) {
let (_id, obkv) = entry?;
let document = obkv_to_json(&all_fields, &fields_ids_map, obkv)?;
let document = match &attributes_to_retrieve {
Some(attributes_to_retrieve) => permissive_json_pointer::select_values(
&document,
attributes_to_retrieve.iter().map(|s| s.as_ref()),
),
None => document,
};
documents.push(document);
}
let number_of_documents = self.number_of_documents(&txn)?;
Ok((number_of_documents, documents))
}
pub fn retrieve_document<S: AsRef<str>>(
&self,
doc_id: String,
attributes_to_retrieve: Option<Vec<S>>,
) -> Result<Document> {
let txn = self.read_txn()?;
let fields_ids_map = self.fields_ids_map(&txn)?;
let all_fields: Vec<_> = fields_ids_map.iter().map(|(id, _)| id).collect();
let internal_id = self
.external_documents_ids(&txn)?
.get(doc_id.as_bytes())
.ok_or_else(|| IndexError::DocumentNotFound(doc_id.clone()))?;
let document = self
.documents(&txn, std::iter::once(internal_id))?
.into_iter()
.next()
.map(|(_, d)| d)
.ok_or(IndexError::DocumentNotFound(doc_id))?;
let document = obkv_to_json(&all_fields, &fields_ids_map, document)?;
let document = match &attributes_to_retrieve {
Some(attributes_to_retrieve) => permissive_json_pointer::select_values(
&document,
attributes_to_retrieve.iter().map(|s| s.as_ref()),
),
None => document,
};
Ok(document)
}
pub fn size(&self) -> u64 {
WalkDir::new(self.path())
.into_iter()
.filter_map(|entry| entry.ok())
.filter_map(|entry| entry.metadata().ok())
.filter(|metadata| metadata.is_file())
.fold(0, |acc, m| acc + m.len())
}
pub fn snapshot(&self, path: impl AsRef<Path>) -> Result<()> {
let mut dst = path.as_ref().join(format!("indexes/{}/", self.uuid));
create_dir_all(&dst)?;
dst.push("data.mdb");
let _txn = self.write_txn()?;
self.inner.copy_to_path(dst, CompactionOption::Enabled)?;
Ok(())
}
}
/// When running tests, a dropped server instance does not actually close its environment,
/// leaving a lot of open file descriptors.
impl Drop for Index {
fn drop(&mut self) {
// When dropping the last instance of an index, we want to close the index.
// Note that the close is actually performed only once all the instances are effectively
// dropped.
if Arc::strong_count(&self.inner) == 1 {
self.inner.as_ref().clone().prepare_for_closing();
}
}
}
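// A minimal usage sketch added for illustration (not part of the original file): open an
// index with an arbitrary 100 MiB map size, read its stats and settings, then close it.
// The path and UUID are placeholders, and `IndexerConfig::default()` is assumed to be available.
#[allow(dead_code)]
fn open_index_sketch() -> Result<()> {
    let indexer_config = Arc::new(IndexerConfig::default());
    let index = Index::open(
        "/tmp/example-index",
        100 * 1024 * 1024,
        Uuid::new_v4(),
        indexer_config,
    )?;
    let _stats = index.stats()?; // `is_indexing` is still `None` at this point
    let _settings = index.settings()?; // read back as `Settings<Checked>`
    index.close();
    Ok(())
}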

View File

@@ -1,249 +0,0 @@
pub use search::{
MatchingStrategy, SearchQuery, SearchResult, DEFAULT_CROP_LENGTH, DEFAULT_CROP_MARKER,
DEFAULT_HIGHLIGHT_POST_TAG, DEFAULT_HIGHLIGHT_PRE_TAG, DEFAULT_SEARCH_LIMIT,
};
pub use updates::{apply_settings_to_builder, Checked, Facets, Settings, Unchecked};
mod dump;
pub mod error;
mod search;
pub mod updates;
#[allow(clippy::module_inception)]
mod index;
pub use index::{Document, IndexMeta, IndexStats};
#[cfg(not(test))]
pub use index::Index;
#[cfg(test)]
pub use test::MockIndex as Index;
/// The index::test module provides means of mocking an index instance. It can be used throughout the
/// code for unit testing, in places where an index would normally be used.
#[cfg(test)]
pub mod test {
use std::path::{Path, PathBuf};
use std::sync::Arc;
use milli::update::{
DocumentAdditionResult, DocumentDeletionResult, IndexDocumentsMethod, IndexerConfig,
};
use nelson::Mocker;
use uuid::Uuid;
use super::error::Result;
use super::index::Index;
use super::Document;
use super::{Checked, IndexMeta, IndexStats, SearchQuery, SearchResult, Settings};
use crate::update_file_store::UpdateFileStore;
#[derive(Clone)]
pub enum MockIndex {
Real(Index),
Mock(Arc<Mocker>),
}
impl MockIndex {
pub fn mock(mocker: Mocker) -> Self {
Self::Mock(Arc::new(mocker))
}
pub fn open(
path: impl AsRef<Path>,
size: usize,
uuid: Uuid,
update_handler: Arc<IndexerConfig>,
) -> Result<Self> {
let index = Index::open(path, size, uuid, update_handler)?;
Ok(Self::Real(index))
}
pub fn load_dump(
src: impl AsRef<Path>,
dst: impl AsRef<Path>,
size: usize,
update_handler: &IndexerConfig,
) -> anyhow::Result<()> {
Index::load_dump(src, dst, size, update_handler)
}
pub fn uuid(&self) -> Uuid {
match self {
MockIndex::Real(index) => index.uuid(),
MockIndex::Mock(m) => unsafe { m.get("uuid").call(()) },
}
}
pub fn stats(&self) -> Result<IndexStats> {
match self {
MockIndex::Real(index) => index.stats(),
MockIndex::Mock(m) => unsafe { m.get("stats").call(()) },
}
}
pub fn meta(&self) -> Result<IndexMeta> {
match self {
MockIndex::Real(index) => index.meta(),
MockIndex::Mock(_) => todo!(),
}
}
pub fn settings(&self) -> Result<Settings<Checked>> {
match self {
MockIndex::Real(index) => index.settings(),
MockIndex::Mock(_) => todo!(),
}
}
pub fn retrieve_documents<S: AsRef<str>>(
&self,
offset: usize,
limit: usize,
attributes_to_retrieve: Option<Vec<S>>,
) -> Result<(u64, Vec<Document>)> {
match self {
MockIndex::Real(index) => {
index.retrieve_documents(offset, limit, attributes_to_retrieve)
}
MockIndex::Mock(_) => todo!(),
}
}
pub fn retrieve_document<S: AsRef<str>>(
&self,
doc_id: String,
attributes_to_retrieve: Option<Vec<S>>,
) -> Result<Document> {
match self {
MockIndex::Real(index) => index.retrieve_document(doc_id, attributes_to_retrieve),
MockIndex::Mock(_) => todo!(),
}
}
pub fn size(&self) -> u64 {
match self {
MockIndex::Real(index) => index.size(),
MockIndex::Mock(_) => todo!(),
}
}
pub fn snapshot(&self, path: impl AsRef<Path>) -> Result<()> {
match self {
MockIndex::Real(index) => index.snapshot(path),
MockIndex::Mock(m) => unsafe { m.get("snapshot").call(path.as_ref()) },
}
}
pub fn close(self) {
match self {
MockIndex::Real(index) => index.close(),
MockIndex::Mock(m) => unsafe { m.get("close").call(()) },
}
}
pub fn perform_search(&self, query: SearchQuery) -> Result<SearchResult> {
match self {
MockIndex::Real(index) => index.perform_search(query),
MockIndex::Mock(m) => unsafe { m.get("perform_search").call(query) },
}
}
pub fn dump(&self, path: impl AsRef<Path>) -> Result<()> {
match self {
MockIndex::Real(index) => index.dump(path),
MockIndex::Mock(m) => unsafe { m.get("dump").call(path.as_ref()) },
}
}
pub fn update_documents(
&self,
method: IndexDocumentsMethod,
primary_key: Option<String>,
file_store: UpdateFileStore,
contents: impl Iterator<Item = Uuid>,
) -> Result<Vec<Result<DocumentAdditionResult>>> {
match self {
MockIndex::Real(index) => {
index.update_documents(method, primary_key, file_store, contents)
}
MockIndex::Mock(mocker) => unsafe {
mocker
.get("update_documents")
.call((method, primary_key, file_store, contents))
},
}
}
pub fn update_settings(&self, settings: &Settings<Checked>) -> Result<()> {
match self {
MockIndex::Real(index) => index.update_settings(settings),
MockIndex::Mock(m) => unsafe { m.get("update_settings").call(settings) },
}
}
pub fn update_primary_key(&self, primary_key: String) -> Result<IndexMeta> {
match self {
MockIndex::Real(index) => index.update_primary_key(primary_key),
MockIndex::Mock(m) => unsafe { m.get("update_primary_key").call(primary_key) },
}
}
pub fn delete_documents(&self, ids: &[String]) -> Result<DocumentDeletionResult> {
match self {
MockIndex::Real(index) => index.delete_documents(ids),
MockIndex::Mock(m) => unsafe { m.get("delete_documents").call(ids) },
}
}
pub fn clear_documents(&self) -> Result<()> {
match self {
MockIndex::Real(index) => index.clear_documents(),
MockIndex::Mock(m) => unsafe { m.get("clear_documents").call(()) },
}
}
}
#[test]
fn test_faux_index() {
let faux = Mocker::default();
faux.when("snapshot")
.times(2)
.then(|_: &Path| -> Result<()> { Ok(()) });
let index = MockIndex::mock(faux);
let path = PathBuf::from("hello");
index.snapshot(&path).unwrap();
index.snapshot(&path).unwrap();
}
#[test]
#[should_panic]
fn test_faux_unexisting_method_stub() {
let faux = Mocker::default();
let index = MockIndex::mock(faux);
let path = PathBuf::from("hello");
index.snapshot(&path).unwrap();
index.snapshot(&path).unwrap();
}
#[test]
#[should_panic]
fn test_faux_panic() {
let faux = Mocker::default();
faux.when("snapshot")
.times(2)
.then(|_: &Path| -> Result<()> {
panic!();
});
let index = MockIndex::mock(faux);
let path = PathBuf::from("hello");
index.snapshot(&path).unwrap();
index.snapshot(&path).unwrap();
}
}

View File

@@ -1,688 +0,0 @@
use std::cmp::min;
use std::collections::{BTreeMap, BTreeSet, HashSet};
use std::str::FromStr;
use std::time::Instant;
use either::Either;
use milli::tokenizer::TokenizerBuilder;
use milli::{
AscDesc, FieldId, FieldsIdsMap, Filter, FormatOptions, MatchBounds, MatcherBuilder, SortError,
TermsMatchingStrategy, DEFAULT_VALUES_PER_FACET,
};
use regex::Regex;
use serde::{Deserialize, Serialize};
use serde_json::{json, Value};
use crate::index::error::FacetError;
use super::error::{IndexError, Result};
use super::index::Index;
pub type Document = serde_json::Map<String, Value>;
type MatchesPosition = BTreeMap<String, Vec<MatchBounds>>;
pub const DEFAULT_SEARCH_LIMIT: fn() -> usize = || 20;
pub const DEFAULT_CROP_LENGTH: fn() -> usize = || 10;
pub const DEFAULT_CROP_MARKER: fn() -> String = || "…".to_string();
pub const DEFAULT_HIGHLIGHT_PRE_TAG: fn() -> String = || "<em>".to_string();
pub const DEFAULT_HIGHLIGHT_POST_TAG: fn() -> String = || "</em>".to_string();
/// The maximum number of results that the engine
/// will be able to return in one search call.
pub const DEFAULT_PAGINATION_MAX_TOTAL_HITS: usize = 1000;
#[derive(Deserialize, Debug, Clone, PartialEq, Eq)]
#[serde(rename_all = "camelCase", deny_unknown_fields)]
pub struct SearchQuery {
pub q: Option<String>,
pub offset: Option<usize>,
#[serde(default = "DEFAULT_SEARCH_LIMIT")]
pub limit: usize,
pub attributes_to_retrieve: Option<BTreeSet<String>>,
pub attributes_to_crop: Option<Vec<String>>,
#[serde(default = "DEFAULT_CROP_LENGTH")]
pub crop_length: usize,
pub attributes_to_highlight: Option<HashSet<String>>,
// Default to false
#[serde(default = "Default::default")]
pub show_matches_position: bool,
pub filter: Option<Value>,
pub sort: Option<Vec<String>>,
pub facets: Option<Vec<String>>,
#[serde(default = "DEFAULT_HIGHLIGHT_PRE_TAG")]
pub highlight_pre_tag: String,
#[serde(default = "DEFAULT_HIGHLIGHT_POST_TAG")]
pub highlight_post_tag: String,
#[serde(default = "DEFAULT_CROP_MARKER")]
pub crop_marker: String,
#[serde(default)]
pub matching_strategy: MatchingStrategy,
}
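// Added illustration (not in the original file): thanks to the serde defaults above, a body as
// small as `{"q": "shifu"}` deserializes into a full `SearchQuery`:
//
//     let query: SearchQuery = serde_json::from_value(serde_json::json!({ "q": "shifu" })).unwrap();
//     assert_eq!(query.limit, 20);             // DEFAULT_SEARCH_LIMIT
//     assert_eq!(query.crop_length, 10);       // DEFAULT_CROP_LENGTH
//     assert_eq!(query.highlight_pre_tag, "<em>");
//     assert_eq!(query.matching_strategy, MatchingStrategy::Last);
//
// Every `Option` field (filter, sort, facets, ...) is simply `None`.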
#[derive(Deserialize, Debug, Clone, PartialEq, Eq)]
#[serde(rename_all = "camelCase")]
pub enum MatchingStrategy {
/// Remove query words from last to first
Last,
/// All query words are mandatory
All,
}
impl Default for MatchingStrategy {
fn default() -> Self {
Self::Last
}
}
impl From<MatchingStrategy> for TermsMatchingStrategy {
fn from(other: MatchingStrategy) -> Self {
match other {
MatchingStrategy::Last => Self::Last,
MatchingStrategy::All => Self::All,
}
}
}
#[derive(Debug, Clone, Serialize, PartialEq)]
pub struct SearchHit {
#[serde(flatten)]
pub document: Document,
#[serde(rename = "_formatted", skip_serializing_if = "Document::is_empty")]
pub formatted: Document,
#[serde(rename = "_matchesPosition", skip_serializing_if = "Option::is_none")]
pub matches_position: Option<MatchesPosition>,
}
#[derive(Serialize, Debug, Clone, PartialEq)]
#[serde(rename_all = "camelCase")]
pub struct SearchResult {
pub hits: Vec<SearchHit>,
pub estimated_total_hits: u64,
pub query: String,
pub limit: usize,
pub offset: usize,
pub processing_time_ms: u128,
#[serde(skip_serializing_if = "Option::is_none")]
pub facet_distribution: Option<BTreeMap<String, BTreeMap<String, u64>>>,
}
impl Index {
pub fn perform_search(&self, query: SearchQuery) -> Result<SearchResult> {
let before_search = Instant::now();
let rtxn = self.read_txn()?;
let mut search = self.search(&rtxn);
if let Some(ref query) = query.q {
search.query(query);
}
search.terms_matching_strategy(query.matching_strategy.into());
let max_total_hits = self
.pagination_max_total_hits(&rtxn)?
.unwrap_or(DEFAULT_PAGINATION_MAX_TOTAL_HITS);
// Make sure that a user can't get more documents than the hard limit;
// the offset is capped to that limit as well.
let offset = min(query.offset.unwrap_or(0), max_total_hits);
let limit = min(query.limit, max_total_hits.saturating_sub(offset));
search.offset(offset);
search.limit(limit);
if let Some(ref filter) = query.filter {
if let Some(facets) = parse_filter(filter)? {
search.filter(facets);
}
}
if let Some(ref sort) = query.sort {
let sort = match sort.iter().map(|s| AscDesc::from_str(s)).collect() {
Ok(sorts) => sorts,
Err(asc_desc_error) => {
return Err(IndexError::Milli(SortError::from(asc_desc_error).into()))
}
};
search.sort_criteria(sort);
}
let milli::SearchResult {
documents_ids,
matching_words,
candidates,
..
} = search.execute()?;
let fields_ids_map = self.fields_ids_map(&rtxn).unwrap();
let displayed_ids = self
.displayed_fields_ids(&rtxn)?
.map(|fields| fields.into_iter().collect::<BTreeSet<_>>())
.unwrap_or_else(|| fields_ids_map.iter().map(|(id, _)| id).collect());
let fids = |attrs: &BTreeSet<String>| {
let mut ids = BTreeSet::new();
for attr in attrs {
if attr == "*" {
ids = displayed_ids.clone();
break;
}
if let Some(id) = fields_ids_map.id(attr) {
ids.insert(id);
}
}
ids
};
// The attributes to retrieve are the ones explicitly marked as to retrieve (all by default),
// but these attributes must also be present
// - in the fields_ids_map
// - in the displayed attributes
let to_retrieve_ids: BTreeSet<_> = query
.attributes_to_retrieve
.as_ref()
.map(fids)
.unwrap_or_else(|| displayed_ids.clone())
.intersection(&displayed_ids)
.cloned()
.collect();
let attr_to_highlight = query.attributes_to_highlight.unwrap_or_default();
let attr_to_crop = query.attributes_to_crop.unwrap_or_default();
// Attributes in `formatted_options` correspond to the attributes that will be in `_formatted`
// These attributes are:
// - the attributes asked to be highlighted or cropped (with `attributesToCrop` or `attributesToHighlight`)
// - the attributes asked to be retrieved: these attributes will not be highlighted/cropped
// All of these attributes must also be present in the displayed attributes
let formatted_options = compute_formatted_options(
&attr_to_highlight,
&attr_to_crop,
query.crop_length,
&to_retrieve_ids,
&fields_ids_map,
&displayed_ids,
);
let tokenizer = TokenizerBuilder::default().build();
let mut formatter_builder = MatcherBuilder::new(matching_words, tokenizer);
formatter_builder.crop_marker(query.crop_marker);
formatter_builder.highlight_prefix(query.highlight_pre_tag);
formatter_builder.highlight_suffix(query.highlight_post_tag);
let mut documents = Vec::new();
let documents_iter = self.documents(&rtxn, documents_ids)?;
for (_id, obkv) in documents_iter {
// First generate a document with all the displayed fields
let displayed_document = make_document(&displayed_ids, &fields_ids_map, obkv)?;
// select the attributes to retrieve
let attributes_to_retrieve = to_retrieve_ids
.iter()
.map(|&fid| fields_ids_map.name(fid).expect("Missing field name"));
let mut document =
permissive_json_pointer::select_values(&displayed_document, attributes_to_retrieve);
let (matches_position, formatted) = format_fields(
&displayed_document,
&fields_ids_map,
&formatter_builder,
&formatted_options,
query.show_matches_position,
&displayed_ids,
)?;
if let Some(sort) = query.sort.as_ref() {
insert_geo_distance(sort, &mut document);
}
let hit = SearchHit {
document,
formatted,
matches_position,
};
documents.push(hit);
}
let estimated_total_hits = candidates.len();
let facet_distribution = match query.facets {
Some(ref fields) => {
let mut facet_distribution = self.facets_distribution(&rtxn);
let max_values_by_facet = self
.max_values_per_facet(&rtxn)?
.unwrap_or(DEFAULT_VALUES_PER_FACET);
facet_distribution.max_values_per_facet(max_values_by_facet);
if fields.iter().all(|f| f != "*") {
facet_distribution.facets(fields);
}
let distribution = facet_distribution.candidates(candidates).execute()?;
Some(distribution)
}
None => None,
};
let result = SearchResult {
hits: documents,
estimated_total_hits,
query: query.q.clone().unwrap_or_default(),
limit: query.limit,
offset: query.offset.unwrap_or_default(),
processing_time_ms: before_search.elapsed().as_millis(),
facet_distribution,
};
Ok(result)
}
}
fn insert_geo_distance(sorts: &[String], document: &mut Document) {
lazy_static::lazy_static! {
static ref GEO_REGEX: Regex =
Regex::new(r"_geoPoint\(\s*([[:digit:].\-]+)\s*,\s*([[:digit:].\-]+)\s*\)").unwrap();
};
if let Some(capture_group) = sorts.iter().find_map(|sort| GEO_REGEX.captures(sort)) {
// TODO: TAMO: milli encountered an internal error, what do we want to do?
let base = [
capture_group[1].parse().unwrap(),
capture_group[2].parse().unwrap(),
];
let geo_point = &document.get("_geo").unwrap_or(&json!(null));
if let Some((lat, lng)) = geo_point["lat"].as_f64().zip(geo_point["lng"].as_f64()) {
let distance = milli::distance_between_two_points(&base, &[lat, lng]);
document.insert("_geoDistance".to_string(), json!(distance.round() as usize));
}
}
}
fn compute_formatted_options(
attr_to_highlight: &HashSet<String>,
attr_to_crop: &[String],
query_crop_length: usize,
to_retrieve_ids: &BTreeSet<FieldId>,
fields_ids_map: &FieldsIdsMap,
displayed_ids: &BTreeSet<FieldId>,
) -> BTreeMap<FieldId, FormatOptions> {
let mut formatted_options = BTreeMap::new();
add_highlight_to_formatted_options(
&mut formatted_options,
attr_to_highlight,
fields_ids_map,
displayed_ids,
);
add_crop_to_formatted_options(
&mut formatted_options,
attr_to_crop,
query_crop_length,
fields_ids_map,
displayed_ids,
);
// We should not return `_formatted` if there are no valid attributes to highlight or crop
if !formatted_options.is_empty() {
add_non_formatted_ids_to_formatted_options(&mut formatted_options, to_retrieve_ids);
}
formatted_options
}
fn add_highlight_to_formatted_options(
formatted_options: &mut BTreeMap<FieldId, FormatOptions>,
attr_to_highlight: &HashSet<String>,
fields_ids_map: &FieldsIdsMap,
displayed_ids: &BTreeSet<FieldId>,
) {
for attr in attr_to_highlight {
let new_format = FormatOptions {
highlight: true,
crop: None,
};
if attr == "*" {
for id in displayed_ids {
formatted_options.insert(*id, new_format);
}
break;
}
if let Some(id) = fields_ids_map.id(attr) {
if displayed_ids.contains(&id) {
formatted_options.insert(id, new_format);
}
}
}
}
fn add_crop_to_formatted_options(
formatted_options: &mut BTreeMap<FieldId, FormatOptions>,
attr_to_crop: &[String],
crop_length: usize,
fields_ids_map: &FieldsIdsMap,
displayed_ids: &BTreeSet<FieldId>,
) {
for attr in attr_to_crop {
let mut split = attr.rsplitn(2, ':');
let (attr_name, attr_len) = match split.next().zip(split.next()) {
Some((len, name)) => {
let crop_len = len.parse::<usize>().unwrap_or(crop_length);
(name, crop_len)
}
None => (attr.as_str(), crop_length),
};
if attr_name == "*" {
for id in displayed_ids {
formatted_options
.entry(*id)
.and_modify(|f| f.crop = Some(attr_len))
.or_insert(FormatOptions {
highlight: false,
crop: Some(attr_len),
});
}
}
if let Some(id) = fields_ids_map.id(attr_name) {
if displayed_ids.contains(&id) {
formatted_options
.entry(id)
.and_modify(|f| f.crop = Some(attr_len))
.or_insert(FormatOptions {
highlight: false,
crop: Some(attr_len),
});
}
}
}
}
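// Added note (not in the original file): entries in `attributesToCrop` may carry a
// per-attribute crop length using the `name:length` syntax (e.g. `"description:5"`);
// if the length part is missing or does not parse, the query-level `cropLength` is used.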
fn add_non_formatted_ids_to_formatted_options(
formatted_options: &mut BTreeMap<FieldId, FormatOptions>,
to_retrieve_ids: &BTreeSet<FieldId>,
) {
for id in to_retrieve_ids {
formatted_options.entry(*id).or_insert(FormatOptions {
highlight: false,
crop: None,
});
}
}
fn make_document(
displayed_attributes: &BTreeSet<FieldId>,
field_ids_map: &FieldsIdsMap,
obkv: obkv::KvReaderU16,
) -> Result<Document> {
let mut document = serde_json::Map::new();
// recreate the original json
for (key, value) in obkv.iter() {
let value = serde_json::from_slice(value)?;
let key = field_ids_map
.name(key)
.expect("Missing field name")
.to_string();
document.insert(key, value);
}
// select the attributes to retrieve
let displayed_attributes = displayed_attributes
.iter()
.map(|&fid| field_ids_map.name(fid).expect("Missing field name"));
let document = permissive_json_pointer::select_values(&document, displayed_attributes);
Ok(document)
}
fn format_fields<'a, A: AsRef<[u8]>>(
document: &Document,
field_ids_map: &FieldsIdsMap,
builder: &MatcherBuilder<'a, A>,
formatted_options: &BTreeMap<FieldId, FormatOptions>,
compute_matches: bool,
displayable_ids: &BTreeSet<FieldId>,
) -> Result<(Option<MatchesPosition>, Document)> {
let mut matches_position = compute_matches.then(BTreeMap::new);
let mut document = document.clone();
// select the attributes to retrieve
let displayable_names = displayable_ids
.iter()
.map(|&fid| field_ids_map.name(fid).expect("Missing field name"));
permissive_json_pointer::map_leaf_values(&mut document, displayable_names, |key, value| {
// To get the formatting option of each key we need to see all the rules that apply
// to the value and merge them together. E.g. if a user asked to highlight `doggo`
// and crop `doggo.name`, then `doggo.name` needs to be highlighted + cropped while `doggo.age` is only
// highlighted.
let format = formatted_options
.iter()
.filter(|(field, _option)| {
let name = field_ids_map.name(**field).unwrap();
milli::is_faceted_by(name, key) || milli::is_faceted_by(key, name)
})
.map(|(_, option)| *option)
.reduce(|acc, option| acc.merge(option));
let mut infos = Vec::new();
*value = format_value(
std::mem::take(value),
builder,
format,
&mut infos,
compute_matches,
);
if let Some(matches) = matches_position.as_mut() {
if !infos.is_empty() {
matches.insert(key.to_owned(), infos);
}
}
});
let selectors = formatted_options
.keys()
// This unwrap must be safe since we got the ids from the fields_ids_map just
// before.
.map(|&fid| field_ids_map.name(fid).unwrap());
let document = permissive_json_pointer::select_values(&document, selectors);
Ok((matches_position, document))
}
fn format_value<'a, A: AsRef<[u8]>>(
value: Value,
builder: &MatcherBuilder<'a, A>,
format_options: Option<FormatOptions>,
infos: &mut Vec<MatchBounds>,
compute_matches: bool,
) -> Value {
match value {
Value::String(old_string) => {
let mut matcher = builder.build(&old_string);
if compute_matches {
let matches = matcher.matches();
infos.extend_from_slice(&matches[..]);
}
match format_options {
Some(format_options) => {
let value = matcher.format(format_options);
Value::String(value.into_owned())
}
None => Value::String(old_string),
}
}
Value::Array(values) => Value::Array(
values
.into_iter()
.map(|v| {
format_value(
v,
builder,
format_options.map(|format_options| FormatOptions {
highlight: format_options.highlight,
crop: None,
}),
infos,
compute_matches,
)
})
.collect(),
),
Value::Object(object) => Value::Object(
object
.into_iter()
.map(|(k, v)| {
(
k,
format_value(
v,
builder,
format_options.map(|format_options| FormatOptions {
highlight: format_options.highlight,
crop: None,
}),
infos,
compute_matches,
),
)
})
.collect(),
),
Value::Number(number) => {
let s = number.to_string();
let mut matcher = builder.build(&s);
if compute_matches {
let matches = matcher.matches();
infos.extend_from_slice(&matches[..]);
}
match format_options {
Some(format_options) => {
let value = matcher.format(format_options);
Value::String(value.into_owned())
}
None => Value::Number(number),
}
}
value => value,
}
}
fn parse_filter(facets: &Value) -> Result<Option<Filter>> {
match facets {
Value::String(expr) => {
let condition = Filter::from_str(expr)?;
Ok(condition)
}
Value::Array(arr) => parse_filter_array(arr),
v => Err(FacetError::InvalidExpression(&["Array"], v.clone()).into()),
}
}
fn parse_filter_array(arr: &[Value]) -> Result<Option<Filter>> {
let mut ands = Vec::new();
for value in arr {
match value {
Value::String(s) => ands.push(Either::Right(s.as_str())),
Value::Array(arr) => {
let mut ors = Vec::new();
for value in arr {
match value {
Value::String(s) => ors.push(s.as_str()),
v => {
return Err(FacetError::InvalidExpression(&["String"], v.clone()).into())
}
}
}
ands.push(Either::Left(ors));
}
v => {
return Err(
FacetError::InvalidExpression(&["String", "[String]"], v.clone()).into(),
)
}
}
}
Ok(Filter::from_array(ands)?)
}
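// Added illustration (not in the original file): `parse_filter` accepts either one filter
// string or a mixed array in which top-level entries are combined with AND and nested
// arrays with OR, e.g.
//
//     parse_filter(&json!("genre = horror AND year > 2000"))
//     parse_filter(&json!([["genre = horror", "genre = thriller"], "year > 2000"]))
//
// Any other JSON type is rejected with `FacetError::InvalidExpression`.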
#[cfg(test)]
mod test {
use super::*;
#[test]
fn test_insert_geo_distance() {
let value: Document = serde_json::from_str(
r#"{
"_geo": {
"lat": 50.629973371633746,
"lng": 3.0569447399419567
},
"city": "Lille",
"id": "1"
}"#,
)
.unwrap();
let sorters = &["_geoPoint(50.629973371633746,3.0569447399419567):desc".to_string()];
let mut document = value.clone();
insert_geo_distance(sorters, &mut document);
assert_eq!(document.get("_geoDistance"), Some(&json!(0)));
let sorters = &["_geoPoint(50.629973371633746, 3.0569447399419567):asc".to_string()];
let mut document = value.clone();
insert_geo_distance(sorters, &mut document);
assert_eq!(document.get("_geoDistance"), Some(&json!(0)));
let sorters =
&["_geoPoint( 50.629973371633746 , 3.0569447399419567 ):desc".to_string()];
let mut document = value.clone();
insert_geo_distance(sorters, &mut document);
assert_eq!(document.get("_geoDistance"), Some(&json!(0)));
let sorters = &[
"prix:asc",
"villeneuve:desc",
"_geoPoint(50.629973371633746, 3.0569447399419567):asc",
"ubu:asc",
]
.map(|s| s.to_string());
let mut document = value.clone();
insert_geo_distance(sorters, &mut document);
assert_eq!(document.get("_geoDistance"), Some(&json!(0)));
// only the first geoPoint is used to compute the distance
let sorters = &[
"chien:desc",
"_geoPoint(50.629973371633746, 3.0569447399419567):asc",
"pangolin:desc",
"_geoPoint(100.0, -80.0):asc",
"chat:asc",
]
.map(|s| s.to_string());
let mut document = value.clone();
insert_geo_distance(sorters, &mut document);
assert_eq!(document.get("_geoDistance"), Some(&json!(0)));
// there was no _geoPoint so nothing is inserted in the document
let sorters = &["chien:asc".to_string()];
let mut document = value;
insert_geo_distance(sorters, &mut document);
assert_eq!(document.get("_geoDistance"), None);
}
}

View File

@@ -1,559 +0,0 @@
use std::collections::{BTreeMap, BTreeSet};
use std::marker::PhantomData;
use std::num::NonZeroUsize;
use log::{debug, info, trace};
use milli::documents::DocumentsBatchReader;
use milli::update::{
DocumentAdditionResult, DocumentDeletionResult, IndexDocumentsConfig, IndexDocumentsMethod,
Setting,
};
use serde::{Deserialize, Serialize, Serializer};
use uuid::Uuid;
use super::error::{IndexError, Result};
use super::index::{Index, IndexMeta};
use crate::update_file_store::UpdateFileStore;
fn serialize_with_wildcard<S>(
field: &Setting<Vec<String>>,
s: S,
) -> std::result::Result<S::Ok, S::Error>
where
S: Serializer,
{
let wildcard = vec!["*".to_string()];
match field {
Setting::Set(value) => Some(value),
Setting::Reset => Some(&wildcard),
Setting::NotSet => None,
}
.serialize(s)
}
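// Added note (not in the original file): with this serializer, `Setting::Set(vec!["title"])`
// is written as `["title"]`, `Setting::Reset` is written as `["*"]`, and `Setting::NotSet`
// would be `null` (in practice the field is skipped entirely through
// `skip_serializing_if = "Setting::is_not_set"` on the settings struct below).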
#[derive(Clone, Default, Debug, Serialize, PartialEq, Eq)]
pub struct Checked;
#[derive(Clone, Default, Debug, Serialize, Deserialize, PartialEq, Eq)]
pub struct Unchecked;
#[cfg_attr(test, derive(proptest_derive::Arbitrary))]
#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq)]
#[serde(deny_unknown_fields)]
#[serde(rename_all = "camelCase")]
pub struct MinWordSizeTyposSetting {
#[cfg_attr(test, proptest(strategy = "test::setting_strategy()"))]
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
pub one_typo: Setting<u8>,
#[cfg_attr(test, proptest(strategy = "test::setting_strategy()"))]
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
pub two_typos: Setting<u8>,
}
#[cfg_attr(test, derive(proptest_derive::Arbitrary))]
#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq)]
#[serde(deny_unknown_fields)]
#[serde(rename_all = "camelCase")]
pub struct TypoSettings {
#[cfg_attr(test, proptest(strategy = "test::setting_strategy()"))]
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
pub enabled: Setting<bool>,
#[cfg_attr(test, proptest(strategy = "test::setting_strategy()"))]
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
pub min_word_size_for_typos: Setting<MinWordSizeTyposSetting>,
#[cfg_attr(test, proptest(strategy = "test::setting_strategy()"))]
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
pub disable_on_words: Setting<BTreeSet<String>>,
#[cfg_attr(test, proptest(strategy = "test::setting_strategy()"))]
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
pub disable_on_attributes: Setting<BTreeSet<String>>,
}
#[cfg_attr(test, derive(proptest_derive::Arbitrary))]
#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq)]
#[serde(deny_unknown_fields)]
#[serde(rename_all = "camelCase")]
pub struct FacetingSettings {
#[cfg_attr(test, proptest(strategy = "test::setting_strategy()"))]
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
pub max_values_per_facet: Setting<usize>,
}
#[cfg_attr(test, derive(proptest_derive::Arbitrary))]
#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq)]
#[serde(deny_unknown_fields)]
#[serde(rename_all = "camelCase")]
pub struct PaginationSettings {
#[cfg_attr(test, proptest(strategy = "test::setting_strategy()"))]
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
pub max_total_hits: Setting<usize>,
}
/// Holds all the settings for an index. `T` can either be `Checked` if it represents settings
/// whose validity is guaranteed, or `Unchecked` if they still need to be validated. In the latter case, a
/// call to `check` will return a `Settings<Checked>` from a `Settings<Unchecked>`.
#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq)]
#[serde(deny_unknown_fields)]
#[serde(rename_all = "camelCase")]
#[serde(bound(serialize = "T: Serialize", deserialize = "T: Deserialize<'static>"))]
#[cfg_attr(test, derive(proptest_derive::Arbitrary))]
pub struct Settings<T> {
#[serde(
default,
serialize_with = "serialize_with_wildcard",
skip_serializing_if = "Setting::is_not_set"
)]
#[cfg_attr(test, proptest(strategy = "test::setting_strategy()"))]
pub displayed_attributes: Setting<Vec<String>>,
#[serde(
default,
serialize_with = "serialize_with_wildcard",
skip_serializing_if = "Setting::is_not_set"
)]
#[cfg_attr(test, proptest(strategy = "test::setting_strategy()"))]
pub searchable_attributes: Setting<Vec<String>>,
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
#[cfg_attr(test, proptest(strategy = "test::setting_strategy()"))]
pub filterable_attributes: Setting<BTreeSet<String>>,
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
#[cfg_attr(test, proptest(strategy = "test::setting_strategy()"))]
pub sortable_attributes: Setting<BTreeSet<String>>,
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
#[cfg_attr(test, proptest(strategy = "test::setting_strategy()"))]
pub ranking_rules: Setting<Vec<String>>,
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
#[cfg_attr(test, proptest(strategy = "test::setting_strategy()"))]
pub stop_words: Setting<BTreeSet<String>>,
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
#[cfg_attr(test, proptest(strategy = "test::setting_strategy()"))]
pub synonyms: Setting<BTreeMap<String, Vec<String>>>,
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
#[cfg_attr(test, proptest(strategy = "test::setting_strategy()"))]
pub distinct_attribute: Setting<String>,
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
#[cfg_attr(test, proptest(strategy = "test::setting_strategy()"))]
pub typo_tolerance: Setting<TypoSettings>,
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
#[cfg_attr(test, proptest(strategy = "test::setting_strategy()"))]
pub faceting: Setting<FacetingSettings>,
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
#[cfg_attr(test, proptest(strategy = "test::setting_strategy()"))]
pub pagination: Setting<PaginationSettings>,
#[serde(skip)]
pub _kind: PhantomData<T>,
}
impl Settings<Checked> {
pub fn cleared() -> Settings<Checked> {
Settings {
displayed_attributes: Setting::Reset,
searchable_attributes: Setting::Reset,
filterable_attributes: Setting::Reset,
sortable_attributes: Setting::Reset,
ranking_rules: Setting::Reset,
stop_words: Setting::Reset,
synonyms: Setting::Reset,
distinct_attribute: Setting::Reset,
typo_tolerance: Setting::Reset,
faceting: Setting::Reset,
pagination: Setting::Reset,
_kind: PhantomData,
}
}
pub fn into_unchecked(self) -> Settings<Unchecked> {
let Self {
displayed_attributes,
searchable_attributes,
filterable_attributes,
sortable_attributes,
ranking_rules,
stop_words,
synonyms,
distinct_attribute,
typo_tolerance,
faceting,
pagination,
..
} = self;
Settings {
displayed_attributes,
searchable_attributes,
filterable_attributes,
sortable_attributes,
ranking_rules,
stop_words,
synonyms,
distinct_attribute,
typo_tolerance,
faceting,
pagination,
_kind: PhantomData,
}
}
}
impl Settings<Unchecked> {
pub fn check(self) -> Settings<Checked> {
let displayed_attributes = match self.displayed_attributes {
Setting::Set(fields) => {
if fields.iter().any(|f| f == "*") {
Setting::Reset
} else {
Setting::Set(fields)
}
}
otherwise => otherwise,
};
let searchable_attributes = match self.searchable_attributes {
Setting::Set(fields) => {
if fields.iter().any(|f| f == "*") {
Setting::Reset
} else {
Setting::Set(fields)
}
}
otherwise => otherwise,
};
Settings {
displayed_attributes,
searchable_attributes,
filterable_attributes: self.filterable_attributes,
sortable_attributes: self.sortable_attributes,
ranking_rules: self.ranking_rules,
stop_words: self.stop_words,
synonyms: self.synonyms,
distinct_attribute: self.distinct_attribute,
typo_tolerance: self.typo_tolerance,
faceting: self.faceting,
pagination: self.pagination,
_kind: PhantomData,
}
}
}
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(deny_unknown_fields)]
#[serde(rename_all = "camelCase")]
pub struct Facets {
pub level_group_size: Option<NonZeroUsize>,
pub min_level_size: Option<NonZeroUsize>,
}
impl Index {
fn update_primary_key_txn<'a, 'b>(
&'a self,
txn: &mut milli::heed::RwTxn<'a, 'b>,
primary_key: String,
) -> Result<IndexMeta> {
let mut builder = milli::update::Settings::new(txn, self, self.indexer_config.as_ref());
builder.set_primary_key(primary_key);
builder.execute(|_| ())?;
let meta = IndexMeta::new_txn(self, txn)?;
Ok(meta)
}
pub fn update_primary_key(&self, primary_key: String) -> Result<IndexMeta> {
let mut txn = self.write_txn()?;
let res = self.update_primary_key_txn(&mut txn, primary_key)?;
txn.commit()?;
Ok(res)
}
/// Deletes `ids` from the index, and returns how many documents were deleted.
pub fn delete_documents(&self, ids: &[String]) -> Result<DocumentDeletionResult> {
let mut txn = self.write_txn()?;
let mut builder = milli::update::DeleteDocuments::new(&mut txn, self)?;
// We ignore nonexistent document ids
ids.iter().for_each(|id| {
builder.delete_external_id(id);
});
let deleted = builder.execute()?;
txn.commit()?;
Ok(deleted)
}
pub fn clear_documents(&self) -> Result<()> {
let mut txn = self.write_txn()?;
milli::update::ClearDocuments::new(&mut txn, self).execute()?;
txn.commit()?;
Ok(())
}
pub fn update_documents(
&self,
method: IndexDocumentsMethod,
primary_key: Option<String>,
file_store: UpdateFileStore,
contents: impl IntoIterator<Item = Uuid>,
) -> Result<Vec<Result<DocumentAdditionResult>>> {
trace!("performing document addition");
let mut txn = self.write_txn()?;
if let Some(primary_key) = primary_key {
if self.primary_key(&txn)?.is_none() {
self.update_primary_key_txn(&mut txn, primary_key)?;
}
}
let config = IndexDocumentsConfig {
update_method: method,
..Default::default()
};
let indexing_callback = |indexing_step| debug!("update: {:?}", indexing_step);
let mut builder = milli::update::IndexDocuments::new(
&mut txn,
self,
self.indexer_config.as_ref(),
config,
indexing_callback,
)?;
let mut results = Vec::new();
for content_uuid in contents.into_iter() {
let content_file = file_store.get_update(content_uuid)?;
let reader = DocumentsBatchReader::from_reader(content_file)?;
let (new_builder, user_result) = builder.add_documents(reader)?;
builder = new_builder;
let user_result = match user_result {
Ok(count) => Ok(DocumentAdditionResult {
indexed_documents: count,
number_of_documents: count,
}),
Err(e) => Err(IndexError::from(e)),
};
results.push(user_result);
}
if results.iter().any(Result::is_ok) {
let addition = builder.execute()?;
txn.commit()?;
info!("document addition done: {:?}", addition);
}
Ok(results)
}
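// Added note (not in the original file): each content file above yields its own entry in the
// returned `Vec<Result<_>>`; the indexing builder is executed and the transaction committed
// only when at least one addition succeeded, so a batch made entirely of invalid payloads
// leaves the index untouched.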
pub fn update_settings(&self, settings: &Settings<Checked>) -> Result<()> {
// We must use the write transaction of the update here.
let mut txn = self.write_txn()?;
let mut builder =
milli::update::Settings::new(&mut txn, self, self.indexer_config.as_ref());
apply_settings_to_builder(settings, &mut builder);
builder.execute(|indexing_step| debug!("update: {:?}", indexing_step))?;
txn.commit()?;
Ok(())
}
}
pub fn apply_settings_to_builder(
settings: &Settings<Checked>,
builder: &mut milli::update::Settings,
) {
match settings.searchable_attributes {
Setting::Set(ref names) => builder.set_searchable_fields(names.clone()),
Setting::Reset => builder.reset_searchable_fields(),
Setting::NotSet => (),
}
match settings.displayed_attributes {
Setting::Set(ref names) => builder.set_displayed_fields(names.clone()),
Setting::Reset => builder.reset_displayed_fields(),
Setting::NotSet => (),
}
match settings.filterable_attributes {
Setting::Set(ref facets) => {
builder.set_filterable_fields(facets.clone().into_iter().collect())
}
Setting::Reset => builder.reset_filterable_fields(),
Setting::NotSet => (),
}
match settings.sortable_attributes {
Setting::Set(ref fields) => builder.set_sortable_fields(fields.iter().cloned().collect()),
Setting::Reset => builder.reset_sortable_fields(),
Setting::NotSet => (),
}
match settings.ranking_rules {
Setting::Set(ref criteria) => builder.set_criteria(criteria.clone()),
Setting::Reset => builder.reset_criteria(),
Setting::NotSet => (),
}
match settings.stop_words {
Setting::Set(ref stop_words) => builder.set_stop_words(stop_words.clone()),
Setting::Reset => builder.reset_stop_words(),
Setting::NotSet => (),
}
match settings.synonyms {
Setting::Set(ref synonyms) => builder.set_synonyms(synonyms.clone().into_iter().collect()),
Setting::Reset => builder.reset_synonyms(),
Setting::NotSet => (),
}
match settings.distinct_attribute {
Setting::Set(ref attr) => builder.set_distinct_field(attr.clone()),
Setting::Reset => builder.reset_distinct_field(),
Setting::NotSet => (),
}
match settings.typo_tolerance {
Setting::Set(ref value) => {
match value.enabled {
Setting::Set(val) => builder.set_autorize_typos(val),
Setting::Reset => builder.reset_authorize_typos(),
Setting::NotSet => (),
}
match value.min_word_size_for_typos {
Setting::Set(ref setting) => {
match setting.one_typo {
Setting::Set(val) => builder.set_min_word_len_one_typo(val),
Setting::Reset => builder.reset_min_word_len_one_typo(),
Setting::NotSet => (),
}
match setting.two_typos {
Setting::Set(val) => builder.set_min_word_len_two_typos(val),
Setting::Reset => builder.reset_min_word_len_two_typos(),
Setting::NotSet => (),
}
}
Setting::Reset => {
builder.reset_min_word_len_one_typo();
builder.reset_min_word_len_two_typos();
}
Setting::NotSet => (),
}
match value.disable_on_words {
Setting::Set(ref words) => {
builder.set_exact_words(words.clone());
}
Setting::Reset => builder.reset_exact_words(),
Setting::NotSet => (),
}
match value.disable_on_attributes {
Setting::Set(ref words) => {
builder.set_exact_attributes(words.iter().cloned().collect())
}
Setting::Reset => builder.reset_exact_attributes(),
Setting::NotSet => (),
}
}
Setting::Reset => {
// all typo settings need to be reset here.
builder.reset_authorize_typos();
builder.reset_min_word_len_one_typo();
builder.reset_min_word_len_two_typos();
builder.reset_exact_words();
builder.reset_exact_attributes();
}
Setting::NotSet => (),
}
match settings.faceting {
Setting::Set(ref value) => match value.max_values_per_facet {
Setting::Set(val) => builder.set_max_values_per_facet(val),
Setting::Reset => builder.reset_max_values_per_facet(),
Setting::NotSet => (),
},
Setting::Reset => builder.reset_max_values_per_facet(),
Setting::NotSet => (),
}
match settings.pagination {
Setting::Set(ref value) => match value.max_total_hits {
Setting::Set(val) => builder.set_pagination_max_total_hits(val),
Setting::Reset => builder.reset_pagination_max_total_hits(),
Setting::NotSet => (),
},
Setting::Reset => builder.reset_pagination_max_total_hits(),
Setting::NotSet => (),
}
}
#[cfg(test)]
pub(crate) mod test {
use proptest::prelude::*;
use super::*;
pub(super) fn setting_strategy<T: Arbitrary + Clone>() -> impl Strategy<Value = Setting<T>> {
prop_oneof![
Just(Setting::NotSet),
Just(Setting::Reset),
any::<T>().prop_map(Setting::Set)
]
}
#[test]
fn test_setting_check() {
// test no changes
let settings = Settings {
displayed_attributes: Setting::Set(vec![String::from("hello")]),
searchable_attributes: Setting::Set(vec![String::from("hello")]),
filterable_attributes: Setting::NotSet,
sortable_attributes: Setting::NotSet,
ranking_rules: Setting::NotSet,
stop_words: Setting::NotSet,
synonyms: Setting::NotSet,
distinct_attribute: Setting::NotSet,
typo_tolerance: Setting::NotSet,
faceting: Setting::NotSet,
pagination: Setting::NotSet,
_kind: PhantomData::<Unchecked>,
};
let checked = settings.clone().check();
assert_eq!(settings.displayed_attributes, checked.displayed_attributes);
assert_eq!(
settings.searchable_attributes,
checked.searchable_attributes
);
// test wildcard
let settings = Settings {
displayed_attributes: Setting::Set(vec![String::from("*")]),
searchable_attributes: Setting::Set(vec![String::from("hello"), String::from("*")]),
filterable_attributes: Setting::NotSet,
sortable_attributes: Setting::NotSet,
ranking_rules: Setting::NotSet,
stop_words: Setting::NotSet,
synonyms: Setting::NotSet,
distinct_attribute: Setting::NotSet,
typo_tolerance: Setting::NotSet,
faceting: Setting::NotSet,
pagination: Setting::NotSet,
_kind: PhantomData::<Unchecked>,
};
let checked = settings.check();
assert_eq!(checked.displayed_attributes, Setting::Reset);
assert_eq!(checked.searchable_attributes, Setting::Reset);
}
}

View File

@@ -1,72 +0,0 @@
use std::error::Error;
use meilisearch_types::error::{Code, ErrorCode};
use meilisearch_types::index_uid::IndexUidFormatError;
use meilisearch_types::internal_error;
use tokio::task::JoinError;
use super::DocumentAdditionFormat;
use crate::document_formats::DocumentFormatError;
use crate::dump::error::DumpError;
use crate::index::error::IndexError;
use crate::tasks::error::TaskError;
use crate::update_file_store::UpdateFileStoreError;
use crate::index_resolver::error::IndexResolverError;
pub type Result<T> = std::result::Result<T, IndexControllerError>;
#[derive(Debug, thiserror::Error)]
pub enum IndexControllerError {
#[error("Index creation must have an uid")]
MissingUid,
#[error("{0}")]
IndexResolver(#[from] IndexResolverError),
#[error("{0}")]
IndexError(#[from] IndexError),
#[error("An internal error has occurred. `{0}`.")]
Internal(Box<dyn Error + Send + Sync + 'static>),
#[error("{0}")]
TaskError(#[from] TaskError),
#[error("{0}")]
DumpError(#[from] DumpError),
#[error("{0}")]
DocumentFormatError(#[from] DocumentFormatError),
#[error("A {0} payload is missing.")]
MissingPayload(DocumentAdditionFormat),
#[error("The provided payload reached the size limit.")]
PayloadTooLarge,
}
internal_error!(IndexControllerError: JoinError, UpdateFileStoreError);
impl From<actix_web::error::PayloadError> for IndexControllerError {
fn from(other: actix_web::error::PayloadError) -> Self {
match other {
actix_web::error::PayloadError::Overflow => Self::PayloadTooLarge,
_ => Self::Internal(Box::new(other)),
}
}
}
impl ErrorCode for IndexControllerError {
fn error_code(&self) -> Code {
match self {
IndexControllerError::MissingUid => Code::BadRequest,
IndexControllerError::IndexResolver(e) => e.error_code(),
IndexControllerError::IndexError(e) => e.error_code(),
IndexControllerError::Internal(_) => Code::Internal,
IndexControllerError::TaskError(e) => e.error_code(),
IndexControllerError::DocumentFormatError(e) => e.error_code(),
IndexControllerError::MissingPayload(_) => Code::MissingPayload,
IndexControllerError::PayloadTooLarge => Code::PayloadTooLarge,
IndexControllerError::DumpError(e) => e.error_code(),
}
}
}
impl From<IndexUidFormatError> for IndexControllerError {
fn from(err: IndexUidFormatError) -> Self {
IndexResolverError::from(err).into()
}
}

View File

@@ -1,779 +0,0 @@
use meilisearch_auth::SearchRules;
use std::collections::BTreeMap;
use std::fmt;
use std::io::Cursor;
use std::path::{Path, PathBuf};
use std::str::FromStr;
use std::sync::Arc;
use std::time::Duration;
use actix_web::error::PayloadError;
use bytes::Bytes;
use futures::Stream;
use futures::StreamExt;
use meilisearch_types::index_uid::IndexUid;
use milli::update::IndexDocumentsMethod;
use serde::{Deserialize, Serialize};
use time::OffsetDateTime;
use tokio::sync::RwLock;
use tokio::task::spawn_blocking;
use tokio::time::sleep;
use uuid::Uuid;
use crate::document_formats::{read_csv, read_json, read_ndjson};
use crate::dump::{self, load_dump, DumpHandler};
use crate::index::{
Checked, Document, IndexMeta, IndexStats, SearchQuery, SearchResult, Settings, Unchecked,
};
use crate::index_resolver::error::IndexResolverError;
use crate::options::{IndexerOpts, SchedulerConfig};
use crate::snapshot::{load_snapshot, SnapshotService};
use crate::tasks::error::TaskError;
use crate::tasks::task::{DocumentDeletion, Task, TaskContent, TaskId};
use crate::tasks::{
BatchHandler, EmptyBatchHandler, Scheduler, SnapshotHandler, TaskFilter, TaskStore,
};
use error::Result;
use self::error::IndexControllerError;
use crate::index_resolver::index_store::{IndexStore, MapIndexStore};
use crate::index_resolver::meta_store::{HeedMetaStore, IndexMetaStore};
use crate::index_resolver::{create_index_resolver, IndexResolver};
use crate::update_file_store::UpdateFileStore;
pub mod error;
pub mod versioning;
/// Concrete implementation of the IndexController, exposed by meilisearch-lib
pub type MeiliSearch = IndexController<HeedMetaStore, MapIndexStore>;
pub type Payload = Box<
dyn Stream<Item = std::result::Result<Bytes, PayloadError>> + Send + Sync + 'static + Unpin,
>;
pub fn open_meta_env(path: &Path, size: usize) -> milli::heed::Result<milli::heed::Env> {
let mut options = milli::heed::EnvOpenOptions::new();
options.map_size(size);
options.max_dbs(20);
options.open(path)
}
#[derive(Debug, Serialize, Deserialize, Clone)]
#[serde(rename_all = "camelCase")]
pub struct IndexMetadata {
#[serde(skip)]
pub uuid: Uuid,
pub uid: String,
#[serde(flatten)]
pub meta: IndexMeta,
}
#[derive(Clone, Debug)]
pub struct IndexSettings {
pub uid: Option<String>,
pub primary_key: Option<String>,
}
pub struct IndexController<U, I> {
pub index_resolver: Arc<IndexResolver<U, I>>,
scheduler: Arc<RwLock<Scheduler>>,
task_store: TaskStore,
pub update_file_store: UpdateFileStore,
}
/// Needs a custom implementation of `Clone` because deriving would require that `U` and `I` are `Clone`.
impl<U, I> Clone for IndexController<U, I> {
fn clone(&self) -> Self {
Self {
index_resolver: self.index_resolver.clone(),
scheduler: self.scheduler.clone(),
update_file_store: self.update_file_store.clone(),
task_store: self.task_store.clone(),
}
}
}
#[derive(Debug)]
pub enum DocumentAdditionFormat {
Json,
Csv,
Ndjson,
}
impl fmt::Display for DocumentAdditionFormat {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
DocumentAdditionFormat::Json => write!(f, "json"),
DocumentAdditionFormat::Ndjson => write!(f, "ndjson"),
DocumentAdditionFormat::Csv => write!(f, "csv"),
}
}
}
#[derive(Serialize, Debug)]
#[serde(rename_all = "camelCase")]
pub struct Stats {
pub database_size: u64,
#[serde(serialize_with = "time::serde::rfc3339::option::serialize")]
pub last_update: Option<OffsetDateTime>,
pub indexes: BTreeMap<String, IndexStats>,
}
#[allow(clippy::large_enum_variant)]
#[derive(derivative::Derivative)]
#[derivative(Debug)]
pub enum Update {
DeleteDocuments(Vec<String>),
ClearDocuments,
Settings {
settings: Settings<Unchecked>,
/// Indicates whether the update was a deletion
is_deletion: bool,
allow_index_creation: bool,
},
DocumentAddition {
#[derivative(Debug = "ignore")]
payload: Payload,
primary_key: Option<String>,
method: IndexDocumentsMethod,
format: DocumentAdditionFormat,
allow_index_creation: bool,
},
DeleteIndex,
CreateIndex {
primary_key: Option<String>,
},
UpdateIndex {
primary_key: Option<String>,
},
}
#[derive(Default, Debug)]
pub struct IndexControllerBuilder {
max_index_size: Option<usize>,
max_task_store_size: Option<usize>,
snapshot_dir: Option<PathBuf>,
import_snapshot: Option<PathBuf>,
snapshot_interval: Option<Duration>,
ignore_snapshot_if_db_exists: bool,
ignore_missing_snapshot: bool,
schedule_snapshot: bool,
dump_src: Option<PathBuf>,
dump_dst: Option<PathBuf>,
ignore_dump_if_db_exists: bool,
ignore_missing_dump: bool,
}
impl IndexControllerBuilder {
pub fn build(
self,
db_path: impl AsRef<Path>,
indexer_options: IndexerOpts,
scheduler_config: SchedulerConfig,
) -> anyhow::Result<MeiliSearch> {
let index_size = self
.max_index_size
.ok_or_else(|| anyhow::anyhow!("Missing index size"))?;
let task_store_size = self
.max_task_store_size
.ok_or_else(|| anyhow::anyhow!("Missing update database size"))?;
if let Some(ref path) = self.import_snapshot {
log::info!("Loading from snapshot {:?}", path);
load_snapshot(
db_path.as_ref(),
path,
self.ignore_snapshot_if_db_exists,
self.ignore_missing_snapshot,
)?;
} else if let Some(ref src_path) = self.dump_src {
load_dump(
db_path.as_ref(),
src_path,
self.ignore_dump_if_db_exists,
self.ignore_missing_dump,
index_size,
task_store_size,
&indexer_options,
)?;
} else if db_path.as_ref().exists() {
// The directory could have been pre-created without any database in it.
let db_is_empty = db_path.as_ref().read_dir()?.next().is_none();
if !db_is_empty {
versioning::check_version_file(db_path.as_ref())?;
}
}
std::fs::create_dir_all(db_path.as_ref())?;
let meta_env = Arc::new(open_meta_env(db_path.as_ref(), task_store_size)?);
let update_file_store = UpdateFileStore::new(&db_path)?;
// Create or overwrite the version file for this DB
versioning::create_version_file(db_path.as_ref())?;
let index_resolver = Arc::new(create_index_resolver(
&db_path,
index_size,
&indexer_options,
meta_env.clone(),
update_file_store.clone(),
)?);
let dump_path = self
.dump_dst
.ok_or_else(|| anyhow::anyhow!("Missing dump directory path"))?;
let dump_handler = Arc::new(DumpHandler::new(
dump_path,
db_path.as_ref().into(),
update_file_store.clone(),
task_store_size,
index_size,
meta_env.clone(),
index_resolver.clone(),
));
let task_store = TaskStore::new(meta_env)?;
// register all the batch handlers for use with the scheduler.
let handlers: Vec<Arc<dyn BatchHandler + Sync + Send + 'static>> = vec![
index_resolver.clone(),
dump_handler,
Arc::new(SnapshotHandler),
// dummy handler to catch all empty batches
Arc::new(EmptyBatchHandler),
];
let scheduler = Scheduler::new(task_store.clone(), handlers, scheduler_config)?;
if self.schedule_snapshot {
let snapshot_period = self
.snapshot_interval
.ok_or_else(|| anyhow::anyhow!("Snapshot interval not provided."))?;
let snapshot_path = self
.snapshot_dir
.ok_or_else(|| anyhow::anyhow!("Snapshot path not provided."))?;
let snapshot_service = SnapshotService {
db_path: db_path.as_ref().to_path_buf(),
snapshot_period,
snapshot_path,
index_size,
meta_env_size: task_store_size,
scheduler: scheduler.clone(),
};
tokio::task::spawn_local(snapshot_service.run());
}
Ok(IndexController {
index_resolver,
scheduler,
update_file_store,
task_store,
})
}
/// Set the index controller builder's max update store size.
pub fn set_max_task_store_size(&mut self, max_update_store_size: usize) -> &mut Self {
let max_update_store_size = clamp_to_page_size(max_update_store_size);
self.max_task_store_size.replace(max_update_store_size);
self
}
pub fn set_max_index_size(&mut self, size: usize) -> &mut Self {
let size = clamp_to_page_size(size);
self.max_index_size.replace(size);
self
}
/// Set the index controller builder's snapshot path.
pub fn set_snapshot_dir(&mut self, snapshot_dir: PathBuf) -> &mut Self {
self.snapshot_dir.replace(snapshot_dir);
self
}
/// Set the index controller builder's ignore snapshot if db exists.
pub fn set_ignore_snapshot_if_db_exists(
&mut self,
ignore_snapshot_if_db_exists: bool,
) -> &mut Self {
self.ignore_snapshot_if_db_exists = ignore_snapshot_if_db_exists;
self
}
/// Set the index controller builder's ignore missing snapshot.
pub fn set_ignore_missing_snapshot(&mut self, ignore_missing_snapshot: bool) -> &mut Self {
self.ignore_missing_snapshot = ignore_missing_snapshot;
self
}
/// Set the index controller builder's import snapshot.
pub fn set_import_snapshot(&mut self, import_snapshot: PathBuf) -> &mut Self {
self.import_snapshot.replace(import_snapshot);
self
}
/// Set the index controller builder's snapshot interval.
pub fn set_snapshot_interval(&mut self, snapshot_interval: Duration) -> &mut Self {
self.snapshot_interval = Some(snapshot_interval);
self
}
/// Set the index controller builder's schedule snapshot.
pub fn set_schedule_snapshot(&mut self) -> &mut Self {
self.schedule_snapshot = true;
self
}
/// Set the index controller builder's dump src.
pub fn set_dump_src(&mut self, dump_src: PathBuf) -> &mut Self {
self.dump_src.replace(dump_src);
self
}
/// Set the index controller builder's dump dst.
pub fn set_dump_dst(&mut self, dump_dst: PathBuf) -> &mut Self {
self.dump_dst.replace(dump_dst);
self
}
/// Set the index controller builder's ignore dump if db exists.
pub fn set_ignore_dump_if_db_exists(&mut self, ignore_dump_if_db_exists: bool) -> &mut Self {
self.ignore_dump_if_db_exists = ignore_dump_if_db_exists;
self
}
/// Set the index controller builder's ignore missing dump.
pub fn set_ignore_missing_dump(&mut self, ignore_missing_dump: bool) -> &mut Self {
self.ignore_missing_dump = ignore_missing_dump;
self
}
}
impl<U, I> IndexController<U, I>
where
U: IndexMetaStore,
I: IndexStore,
{
pub fn builder() -> IndexControllerBuilder {
IndexControllerBuilder::default()
}
pub async fn register_update(&self, uid: String, update: Update) -> Result<Task> {
let index_uid = IndexUid::from_str(&uid).map_err(IndexResolverError::from)?;
let content = match update {
Update::DeleteDocuments(ids) => TaskContent::DocumentDeletion {
index_uid,
deletion: DocumentDeletion::Ids(ids),
},
Update::ClearDocuments => TaskContent::DocumentDeletion {
index_uid,
deletion: DocumentDeletion::Clear,
},
Update::Settings {
settings,
is_deletion,
allow_index_creation,
} => TaskContent::SettingsUpdate {
settings,
is_deletion,
allow_index_creation,
index_uid,
},
Update::DocumentAddition {
mut payload,
primary_key,
format,
method,
allow_index_creation,
} => {
let mut buffer = Vec::new();
while let Some(bytes) = payload.next().await {
let bytes = bytes?;
buffer.extend_from_slice(&bytes);
}
let (content_uuid, mut update_file) = self.update_file_store.new_update()?;
let documents_count = tokio::task::spawn_blocking(move || -> Result<_> {
// check if the payload is empty, and return an error
if buffer.is_empty() {
return Err(IndexControllerError::MissingPayload(format));
}
let reader = Cursor::new(buffer);
let count = match format {
DocumentAdditionFormat::Json => read_json(reader, &mut *update_file)?,
DocumentAdditionFormat::Csv => read_csv(reader, &mut *update_file)?,
DocumentAdditionFormat::Ndjson => read_ndjson(reader, &mut *update_file)?,
};
update_file.persist()?;
Ok(count)
})
.await??;
TaskContent::DocumentAddition {
content_uuid,
merge_strategy: method,
primary_key,
documents_count,
allow_index_creation,
index_uid,
}
}
Update::DeleteIndex => TaskContent::IndexDeletion { index_uid },
Update::CreateIndex { primary_key } => TaskContent::IndexCreation {
primary_key,
index_uid,
},
Update::UpdateIndex { primary_key } => TaskContent::IndexUpdate {
primary_key,
index_uid,
},
};
let task = self.task_store.register(content).await?;
self.scheduler.read().await.notify();
Ok(task)
}
pub async fn register_dump_task(&self) -> Result<Task> {
let uid = dump::generate_uid();
let content = TaskContent::Dump { uid };
let task = self.task_store.register(content).await?;
self.scheduler.read().await.notify();
Ok(task)
}
pub async fn get_task(&self, id: TaskId, filter: Option<TaskFilter>) -> Result<Task> {
let task = self.scheduler.read().await.get_task(id, filter).await?;
Ok(task)
}
pub async fn get_index_task(&self, index_uid: String, task_id: TaskId) -> Result<Task> {
let creation_task_id = self
.index_resolver
.get_index_creation_task_id(index_uid.clone())
.await?;
if task_id < creation_task_id {
return Err(TaskError::UnexistingTask(task_id).into());
}
let mut filter = TaskFilter::default();
filter.filter_index(index_uid);
let task = self
.scheduler
.read()
.await
.get_task(task_id, Some(filter))
.await?;
Ok(task)
}
pub async fn list_tasks(
&self,
filter: Option<TaskFilter>,
limit: Option<usize>,
offset: Option<TaskId>,
) -> Result<Vec<Task>> {
let tasks = self
.scheduler
.read()
.await
.list_tasks(offset, filter, limit)
.await?;
Ok(tasks)
}
pub async fn list_index_task(
&self,
index_uid: String,
limit: Option<usize>,
offset: Option<TaskId>,
) -> Result<Vec<Task>> {
let task_id = self
.index_resolver
.get_index_creation_task_id(index_uid.clone())
.await?;
let mut filter = TaskFilter::default();
filter.filter_index(index_uid);
let tasks = self
.scheduler
.read()
.await
.list_tasks(
Some(offset.unwrap_or_default() + task_id),
Some(filter),
limit,
)
.await?;
Ok(tasks)
}
pub async fn list_indexes(&self) -> Result<Vec<IndexMetadata>> {
let indexes = self.index_resolver.list().await?;
let mut ret = Vec::new();
for (uid, index) in indexes {
let meta = index.meta()?;
let meta = IndexMetadata {
uuid: index.uuid(),
uid,
meta,
};
ret.push(meta);
}
Ok(ret)
}
pub async fn settings(&self, uid: String) -> Result<Settings<Checked>> {
let index = self.index_resolver.get_index(uid).await?;
let settings = spawn_blocking(move || index.settings()).await??;
Ok(settings)
}
/// Return the total number of documents in the index together with the selected page of documents.
pub async fn documents(
&self,
uid: String,
offset: usize,
limit: usize,
attributes_to_retrieve: Option<Vec<String>>,
) -> Result<(u64, Vec<Document>)> {
let index = self.index_resolver.get_index(uid).await?;
let result =
spawn_blocking(move || index.retrieve_documents(offset, limit, attributes_to_retrieve))
.await??;
Ok(result)
}
pub async fn document(
&self,
uid: String,
doc_id: String,
attributes_to_retrieve: Option<Vec<String>>,
) -> Result<Document> {
let index = self.index_resolver.get_index(uid).await?;
let document =
spawn_blocking(move || index.retrieve_document(doc_id, attributes_to_retrieve))
.await??;
Ok(document)
}
pub async fn search(&self, uid: String, query: SearchQuery) -> Result<SearchResult> {
let index = self.index_resolver.get_index(uid).await?;
let result = spawn_blocking(move || index.perform_search(query)).await??;
Ok(result)
}
pub async fn get_index(&self, uid: String) -> Result<IndexMetadata> {
let index = self.index_resolver.get_index(uid.clone()).await?;
let uuid = index.uuid();
let meta = spawn_blocking(move || index.meta()).await??;
let meta = IndexMetadata { uuid, uid, meta };
Ok(meta)
}
pub async fn get_index_stats(&self, uid: String) -> Result<IndexStats> {
let processing_tasks = self.scheduler.read().await.get_processing_tasks().await?;
// Check if the currently indexing update is from our index.
let is_indexing = processing_tasks
.first()
.map_or(false, |task| task.index_uid().map_or(false, |u| u == uid));
let index = self.index_resolver.get_index(uid).await?;
let mut stats = spawn_blocking(move || index.stats()).await??;
stats.is_indexing = Some(is_indexing);
Ok(stats)
}
pub async fn get_all_stats(&self, search_rules: &SearchRules) -> Result<Stats> {
let mut last_task: Option<OffsetDateTime> = None;
let mut indexes = BTreeMap::new();
let mut database_size = 0;
let processing_tasks = self.scheduler.read().await.get_processing_tasks().await?;
for (index_uid, index) in self.index_resolver.list().await? {
if !search_rules.is_index_authorized(&index_uid) {
continue;
}
let (mut stats, meta) =
spawn_blocking::<_, Result<(IndexStats, IndexMeta)>>(move || {
Ok((index.stats()?, index.meta()?))
})
.await??;
database_size += stats.size;
last_task = last_task.map_or(Some(meta.updated_at), |last| {
Some(last.max(meta.updated_at))
});
// Check if the currently indexing update is from our index.
stats.is_indexing = processing_tasks
.first()
.and_then(|p| p.index_uid().map(|u| u == index_uid))
.or(Some(false));
indexes.insert(index_uid, stats);
}
Ok(Stats {
database_size,
last_update: last_task,
indexes,
})
}
}
pub async fn get_arc_ownership_blocking<T>(mut item: Arc<T>) -> T {
loop {
match Arc::try_unwrap(item) {
Ok(item) => return item,
Err(item_arc) => {
item = item_arc;
sleep(Duration::from_millis(100)).await;
continue;
}
}
}
}
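// A hedged usage sketch (not part of the original file): `get_arc_ownership_blocking`
// loops until every other clone of the `Arc` has been dropped, then hands back
// ownership of the inner value. With a single strong reference it returns immediately.
#[cfg(test)]
#[actix_rt::test]
async fn arc_ownership_returned_once_unique() {
    let value = get_arc_ownership_blocking(Arc::new(42u32)).await;
    assert_eq!(value, 42);
}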
// Rounds the provided value down to the nearest multiple of the system page size.
fn clamp_to_page_size(size: usize) -> usize {
size / page_size::get() * page_size::get()
}
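// A minimal sketch of the intended behaviour: LMDB map sizes must be page-aligned,
// so the requested size is rounded down to the nearest multiple of the system page
// size, and anything smaller than one page collapses to 0.
#[cfg(test)]
#[test]
fn clamp_rounds_down_to_a_page_multiple() {
    let page = page_size::get();
    assert_eq!(clamp_to_page_size(3 * page + 123), 3 * page);
    assert_eq!(clamp_to_page_size(page - 1), 0);
}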
#[cfg(test)]
mod test {
use futures::future::ok;
use mockall::predicate::eq;
use nelson::Mocker;
use crate::index::error::Result as IndexResult;
use crate::index::Index;
use crate::index::{
DEFAULT_CROP_MARKER, DEFAULT_HIGHLIGHT_POST_TAG, DEFAULT_HIGHLIGHT_PRE_TAG,
};
use crate::index_resolver::index_store::MockIndexStore;
use crate::index_resolver::meta_store::MockIndexMetaStore;
use crate::index_resolver::IndexResolver;
use super::*;
impl IndexController<MockIndexMetaStore, MockIndexStore> {
pub fn mock(
index_resolver: Arc<IndexResolver<MockIndexMetaStore, MockIndexStore>>,
task_store: TaskStore,
update_file_store: UpdateFileStore,
scheduler: Arc<RwLock<Scheduler>>,
) -> Self {
IndexController {
index_resolver,
task_store,
update_file_store,
scheduler,
}
}
}
#[actix_rt::test]
async fn test_search_simple() {
let index_uid = "test";
let index_uuid = Uuid::new_v4();
let query = SearchQuery {
q: Some(String::from("hello world")),
offset: Some(10),
limit: 0,
attributes_to_retrieve: Some(vec!["string".to_owned()].into_iter().collect()),
attributes_to_crop: None,
crop_length: 18,
attributes_to_highlight: None,
show_matches_position: true,
filter: None,
sort: None,
facets: None,
highlight_pre_tag: DEFAULT_HIGHLIGHT_PRE_TAG(),
highlight_post_tag: DEFAULT_HIGHLIGHT_POST_TAG(),
crop_marker: DEFAULT_CROP_MARKER(),
matching_strategy: Default::default(),
};
let result = SearchResult {
hits: vec![],
estimated_total_hits: 29,
query: "hello world".to_string(),
limit: 24,
offset: 0,
processing_time_ms: 50,
facet_distribution: None,
};
let mut uuid_store = MockIndexMetaStore::new();
uuid_store
.expect_get()
.with(eq(index_uid.to_owned()))
.returning(move |s| {
Box::pin(ok((
s,
Some(crate::index_resolver::meta_store::IndexMeta {
uuid: index_uuid,
creation_task_id: 0,
}),
)))
});
let mut index_store = MockIndexStore::new();
let result_clone = result.clone();
let query_clone = query.clone();
index_store
.expect_get()
.with(eq(index_uuid))
.returning(move |_uuid| {
let result = result_clone.clone();
let query = query_clone.clone();
let mocker = Mocker::default();
mocker
.when::<SearchQuery, IndexResult<SearchResult>>("perform_search")
.once()
.then(move |q| {
assert_eq!(&q, &query);
Ok(result.clone())
});
let index = Index::mock(mocker);
Box::pin(ok(Some(index)))
});
let task_store_mocker = nelson::Mocker::default();
let mocker = Mocker::default();
let update_file_store = UpdateFileStore::mock(mocker);
let index_resolver = Arc::new(IndexResolver::new(
uuid_store,
index_store,
update_file_store.clone(),
));
let task_store = TaskStore::mock(task_store_mocker);
let scheduler = Scheduler::new(
task_store.clone(),
vec![index_resolver.clone()],
SchedulerConfig::default(),
)
.unwrap();
let index_controller =
IndexController::mock(index_resolver, task_store, update_file_store, scheduler);
let r = index_controller
.search(index_uid.to_owned(), query.clone())
.await
.unwrap();
assert_eq!(r, result);
}
}

View File

@@ -1,79 +0,0 @@
use std::error::Error;
use std::fmt;
use meilisearch_types::{internal_error, Code, ErrorCode};
use crate::{
document_formats::DocumentFormatError,
index::error::IndexError,
index_controller::{update_file_store::UpdateFileStoreError, DocumentAdditionFormat},
};
pub type Result<T> = std::result::Result<T, UpdateLoopError>;
#[derive(Debug, thiserror::Error)]
#[allow(clippy::large_enum_variant)]
pub enum UpdateLoopError {
#[error("Task `{0}` not found.")]
UnexistingUpdate(u64),
#[error("An internal error has occurred. `{0}`.")]
Internal(Box<dyn Error + Send + Sync + 'static>),
#[error(
"update store was shut down due to a fatal error, please check your logs for more info."
)]
FatalUpdateStoreError,
#[error("{0}")]
DocumentFormatError(#[from] DocumentFormatError),
#[error("The provided payload reached the size limit.")]
PayloadTooLarge,
#[error("A {0} payload is missing.")]
MissingPayload(DocumentAdditionFormat),
#[error("{0}")]
IndexError(#[from] IndexError),
}
impl<T> From<tokio::sync::mpsc::error::SendError<T>> for UpdateLoopError
where
T: Sync + Send + 'static + fmt::Debug,
{
fn from(other: tokio::sync::mpsc::error::SendError<T>) -> Self {
Self::Internal(Box::new(other))
}
}
impl From<tokio::sync::oneshot::error::RecvError> for UpdateLoopError {
fn from(other: tokio::sync::oneshot::error::RecvError) -> Self {
Self::Internal(Box::new(other))
}
}
impl From<actix_web::error::PayloadError> for UpdateLoopError {
fn from(other: actix_web::error::PayloadError) -> Self {
match other {
actix_web::error::PayloadError::Overflow => Self::PayloadTooLarge,
_ => Self::Internal(Box::new(other)),
}
}
}
internal_error!(
UpdateLoopError: heed::Error,
std::io::Error,
serde_json::Error,
tokio::task::JoinError,
UpdateFileStoreError
);
impl ErrorCode for UpdateLoopError {
fn error_code(&self) -> Code {
match self {
Self::UnexistingUpdate(_) => Code::TaskNotFound,
Self::Internal(_) => Code::Internal,
Self::FatalUpdateStoreError => Code::Internal,
Self::DocumentFormatError(error) => error.error_code(),
Self::PayloadTooLarge => Code::PayloadTooLarge,
Self::MissingPayload(_) => Code::MissingPayload,
Self::IndexError(e) => e.error_code(),
}
}
}

View File

@@ -1,19 +0,0 @@
#[derive(thiserror::Error, Debug)]
pub enum VersionFileError {
#[error(
"Meilisearch (v{}) failed to infer the version of the database. Please consider using a dump to load your data.",
env!("CARGO_PKG_VERSION").to_string()
)]
MissingVersionFile,
#[error("Version file is corrupted and thus Meilisearch is unable to determine the version of the database.")]
MalformedVersionFile,
#[error(
"Expected Meilisearch engine version: {major}.{minor}.{patch}, current engine version: {}. To update Meilisearch use a dump.",
env!("CARGO_PKG_VERSION").to_string()
)]
VersionMismatch {
major: String,
minor: String,
patch: String,
},
}

View File

@@ -1,56 +0,0 @@
use std::fs;
use std::io::ErrorKind;
use std::path::Path;
use self::error::VersionFileError;
mod error;
pub const VERSION_FILE_NAME: &str = "VERSION";
static VERSION_MAJOR: &str = env!("CARGO_PKG_VERSION_MAJOR");
static VERSION_MINOR: &str = env!("CARGO_PKG_VERSION_MINOR");
static VERSION_PATCH: &str = env!("CARGO_PKG_VERSION_PATCH");
// Persists the version of the current Meilisearch binary to a VERSION file
pub fn create_version_file(db_path: &Path) -> anyhow::Result<()> {
let version_path = db_path.join(VERSION_FILE_NAME);
fs::write(
version_path,
format!("{}.{}.{}", VERSION_MAJOR, VERSION_MINOR, VERSION_PATCH),
)?;
Ok(())
}
// Ensures the Meilisearch version is compatible with the database; returns an error if the versions mismatch.
pub fn check_version_file(db_path: &Path) -> anyhow::Result<()> {
let version_path = db_path.join(VERSION_FILE_NAME);
match fs::read_to_string(&version_path) {
Ok(version) => {
let version_components = version.split('.').collect::<Vec<_>>();
let (major, minor, patch) = match &version_components[..] {
[major, minor, patch] => (major.to_string(), minor.to_string(), patch.to_string()),
_ => return Err(VersionFileError::MalformedVersionFile.into()),
};
if major != VERSION_MAJOR || minor != VERSION_MINOR {
return Err(VersionFileError::VersionMismatch {
major,
minor,
patch,
}
.into());
}
}
Err(error) => {
return match error.kind() {
ErrorKind::NotFound => Err(VersionFileError::MissingVersionFile.into()),
_ => Err(error.into()),
}
}
}
Ok(())
}
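// A hedged round-trip sketch, assuming the `tempfile` dependency already used elsewhere
// in this crate: `create_version_file` writes "MAJOR.MINOR.PATCH" for the running binary,
// and `check_version_file` accepts it as long as major and minor match.
#[cfg(test)]
#[test]
fn version_file_round_trip() -> anyhow::Result<()> {
    let dir = tempfile::tempdir()?;
    create_version_file(dir.path())?;
    check_version_file(dir.path())?;
    Ok(())
}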

View File

@@ -1,71 +0,0 @@
use std::fmt;
use meilisearch_types::error::{Code, ErrorCode};
use meilisearch_types::index_uid::IndexUidFormatError;
use meilisearch_types::internal_error;
use tokio::sync::mpsc::error::SendError as MpscSendError;
use tokio::sync::oneshot::error::RecvError as OneshotRecvError;
use uuid::Uuid;
use crate::{error::MilliError, index::error::IndexError, update_file_store::UpdateFileStoreError};
pub type Result<T> = std::result::Result<T, IndexResolverError>;
#[derive(thiserror::Error, Debug)]
pub enum IndexResolverError {
#[error("{0}")]
IndexError(#[from] IndexError),
#[error("Index `{0}` already exists.")]
IndexAlreadyExists(String),
#[error("Index `{0}` not found.")]
UnexistingIndex(String),
#[error("A primary key is already present. It's impossible to update it")]
ExistingPrimaryKey,
#[error("An internal error has occurred. `{0}`.")]
Internal(Box<dyn std::error::Error + Send + Sync + 'static>),
#[error("The creation of the `{0}` index has failed due to `Index uuid is already assigned`.")]
UuidAlreadyExists(Uuid),
#[error("{0}")]
Milli(#[from] milli::Error),
#[error("{0}")]
BadlyFormatted(#[from] IndexUidFormatError),
}
impl<T> From<MpscSendError<T>> for IndexResolverError
where
T: Send + Sync + 'static + fmt::Debug,
{
fn from(other: tokio::sync::mpsc::error::SendError<T>) -> Self {
Self::Internal(Box::new(other))
}
}
impl From<OneshotRecvError> for IndexResolverError {
fn from(other: tokio::sync::oneshot::error::RecvError) -> Self {
Self::Internal(Box::new(other))
}
}
internal_error!(
IndexResolverError: milli::heed::Error,
uuid::Error,
std::io::Error,
tokio::task::JoinError,
serde_json::Error,
UpdateFileStoreError
);
impl ErrorCode for IndexResolverError {
fn error_code(&self) -> Code {
match self {
IndexResolverError::IndexError(e) => e.error_code(),
IndexResolverError::IndexAlreadyExists(_) => Code::IndexAlreadyExists,
IndexResolverError::UnexistingIndex(_) => Code::IndexNotFound,
IndexResolverError::ExistingPrimaryKey => Code::PrimaryKeyAlreadyPresent,
IndexResolverError::Internal(_) => Code::Internal,
IndexResolverError::UuidAlreadyExists(_) => Code::CreateIndex,
IndexResolverError::Milli(e) => MilliError(e).error_code(),
IndexResolverError::BadlyFormatted(_) => Code::InvalidIndexUid,
}
}
}

View File

@@ -1,108 +0,0 @@
use std::collections::HashMap;
use std::convert::TryFrom;
use std::path::{Path, PathBuf};
use std::sync::Arc;
use milli::update::IndexerConfig;
use tokio::fs;
use tokio::sync::RwLock;
use tokio::task::spawn_blocking;
use uuid::Uuid;
use super::error::{IndexResolverError, Result};
use crate::index::Index;
use crate::options::IndexerOpts;
type AsyncMap<K, V> = Arc<RwLock<HashMap<K, V>>>;
#[async_trait::async_trait]
#[cfg_attr(test, mockall::automock)]
pub trait IndexStore {
async fn create(&self, uuid: Uuid) -> Result<Index>;
async fn get(&self, uuid: Uuid) -> Result<Option<Index>>;
async fn delete(&self, uuid: Uuid) -> Result<Option<Index>>;
}
pub struct MapIndexStore {
index_store: AsyncMap<Uuid, Index>,
path: PathBuf,
index_size: usize,
indexer_config: Arc<IndexerConfig>,
}
impl MapIndexStore {
pub fn new(
path: impl AsRef<Path>,
index_size: usize,
indexer_opts: &IndexerOpts,
) -> anyhow::Result<Self> {
let indexer_config = Arc::new(IndexerConfig::try_from(indexer_opts)?);
let path = path.as_ref().join("indexes/");
let index_store = Arc::new(RwLock::new(HashMap::new()));
Ok(Self {
index_store,
path,
index_size,
indexer_config,
})
}
}
#[async_trait::async_trait]
impl IndexStore for MapIndexStore {
async fn create(&self, uuid: Uuid) -> Result<Index> {
// We need to keep the lock until we are sure the db file has been opened correctly, to
// ensure that another db is not created at the same time.
let mut lock = self.index_store.write().await;
if let Some(index) = lock.get(&uuid) {
return Ok(index.clone());
}
let path = self.path.join(format!("{}", uuid));
if path.exists() {
return Err(IndexResolverError::UuidAlreadyExists(uuid));
}
let index_size = self.index_size;
let update_handler = self.indexer_config.clone();
let index = spawn_blocking(move || -> Result<Index> {
let index = Index::open(path, index_size, uuid, update_handler)?;
Ok(index)
})
.await??;
lock.insert(uuid, index.clone());
Ok(index)
}
async fn get(&self, uuid: Uuid) -> Result<Option<Index>> {
let guard = self.index_store.read().await;
match guard.get(&uuid) {
Some(index) => Ok(Some(index.clone())),
None => {
// Drop the read guard here so the write below can proceed without deadlocking.
drop(guard);
let path = self.path.join(format!("{}", uuid));
if !path.exists() {
return Ok(None);
}
let index_size = self.index_size;
let update_handler = self.indexer_config.clone();
let index =
spawn_blocking(move || Index::open(path, index_size, uuid, update_handler))
.await??;
self.index_store.write().await.insert(uuid, index.clone());
Ok(Some(index))
}
}
}
async fn delete(&self, uuid: Uuid) -> Result<Option<Index>> {
let db_path = self.path.join(format!("{}", uuid));
fs::remove_dir_all(db_path).await?;
let index = self.index_store.write().await.remove(&uuid);
Ok(index)
}
}

View File

@@ -1,223 +0,0 @@
use std::collections::HashSet;
use std::fs::{create_dir_all, File};
use std::io::{BufRead, BufReader, Write};
use std::path::{Path, PathBuf};
use std::sync::Arc;
use walkdir::WalkDir;
use milli::heed::types::{SerdeBincode, Str};
use milli::heed::{CompactionOption, Database, Env};
use serde::{Deserialize, Serialize};
use uuid::Uuid;
use super::error::{IndexResolverError, Result};
use crate::tasks::task::TaskId;
#[derive(Serialize, Deserialize)]
pub struct DumpEntry {
pub uid: String,
pub index_meta: IndexMeta,
}
const UUIDS_DB_PATH: &str = "index_uuids";
#[async_trait::async_trait]
#[cfg_attr(test, mockall::automock)]
pub trait IndexMetaStore: Sized {
// Return the uid together with its metadata if an entry exists for `uid`, `None` otherwise.
async fn get(&self, uid: String) -> Result<(String, Option<IndexMeta>)>;
async fn delete(&self, uid: String) -> Result<Option<IndexMeta>>;
async fn list(&self) -> Result<Vec<(String, IndexMeta)>>;
async fn insert(&self, name: String, meta: IndexMeta) -> Result<()>;
async fn snapshot(&self, path: PathBuf) -> Result<HashSet<Uuid>>;
async fn get_size(&self) -> Result<u64>;
async fn dump(&self, path: PathBuf) -> Result<()>;
}
#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct IndexMeta {
pub uuid: Uuid,
pub creation_task_id: TaskId,
}
#[derive(Clone)]
pub struct HeedMetaStore {
env: Arc<Env>,
db: Database<Str, SerdeBincode<IndexMeta>>,
}
impl Drop for HeedMetaStore {
fn drop(&mut self) {
if Arc::strong_count(&self.env) == 1 {
self.env.as_ref().clone().prepare_for_closing();
}
}
}
impl HeedMetaStore {
pub fn new(env: Arc<milli::heed::Env>) -> Result<Self> {
let db = env.create_database(Some("uuids"))?;
Ok(Self { env, db })
}
fn get(&self, name: &str) -> Result<Option<IndexMeta>> {
let env = self.env.clone();
let db = self.db;
let txn = env.read_txn()?;
match db.get(&txn, name)? {
Some(meta) => Ok(Some(meta)),
None => Ok(None),
}
}
fn delete(&self, uid: String) -> Result<Option<IndexMeta>> {
let env = self.env.clone();
let db = self.db;
let mut txn = env.write_txn()?;
match db.get(&txn, &uid)? {
Some(meta) => {
db.delete(&mut txn, &uid)?;
txn.commit()?;
Ok(Some(meta))
}
None => Ok(None),
}
}
fn list(&self) -> Result<Vec<(String, IndexMeta)>> {
let env = self.env.clone();
let db = self.db;
let txn = env.read_txn()?;
let mut entries = Vec::new();
for entry in db.iter(&txn)? {
let (name, meta) = entry?;
entries.push((name.to_string(), meta))
}
Ok(entries)
}
pub(crate) fn insert(&self, name: String, meta: IndexMeta) -> Result<()> {
let env = self.env.clone();
let db = self.db;
let mut txn = env.write_txn()?;
if db.get(&txn, &name)?.is_some() {
return Err(IndexResolverError::IndexAlreadyExists(name));
}
db.put(&mut txn, &name, &meta)?;
txn.commit()?;
Ok(())
}
fn snapshot(&self, mut path: PathBuf) -> Result<HashSet<Uuid>> {
// Write transaction to acquire a lock on the database.
let txn = self.env.write_txn()?;
let mut entries = HashSet::new();
for entry in self.db.iter(&txn)? {
let (_, IndexMeta { uuid, .. }) = entry?;
entries.insert(uuid);
}
// only perform snapshot if there are indexes
if !entries.is_empty() {
path.push(UUIDS_DB_PATH);
create_dir_all(&path)?;
path.push("data.mdb");
self.env.copy_to_path(path, CompactionOption::Enabled)?;
}
Ok(entries)
}
fn get_size(&self) -> Result<u64> {
Ok(WalkDir::new(self.env.path())
.into_iter()
.filter_map(|entry| entry.ok())
.filter_map(|entry| entry.metadata().ok())
.filter(|metadata| metadata.is_file())
.fold(0, |acc, m| acc + m.len()))
}
pub fn dump(&self, path: PathBuf) -> Result<()> {
let dump_path = path.join(UUIDS_DB_PATH);
create_dir_all(&dump_path)?;
let dump_file_path = dump_path.join("data.jsonl");
let mut dump_file = File::create(&dump_file_path)?;
let txn = self.env.read_txn()?;
for entry in self.db.iter(&txn)? {
let (uid, index_meta) = entry?;
let uid = uid.to_string();
let entry = DumpEntry { uid, index_meta };
serde_json::to_writer(&mut dump_file, &entry)?;
dump_file.write_all(b"\n")?;
}
Ok(())
}
pub fn load_dump(src: impl AsRef<Path>, env: Arc<milli::heed::Env>) -> Result<()> {
let src_indexes = src.as_ref().join(UUIDS_DB_PATH).join("data.jsonl");
let indexes = File::open(&src_indexes)?;
let mut indexes = BufReader::new(indexes);
let mut line = String::new();
let db = Self::new(env)?;
let mut txn = db.env.write_txn()?;
loop {
match indexes.read_line(&mut line) {
Ok(0) => break,
Ok(_) => {
let DumpEntry { uid, index_meta } = serde_json::from_str(&line)?;
db.db.put(&mut txn, &uid, &index_meta)?;
}
Err(e) => return Err(e.into()),
}
line.clear();
}
txn.commit()?;
Ok(())
}
}
#[async_trait::async_trait]
impl IndexMetaStore for HeedMetaStore {
async fn get(&self, name: String) -> Result<(String, Option<IndexMeta>)> {
let this = self.clone();
tokio::task::spawn_blocking(move || this.get(&name).map(|res| (name, res))).await?
}
async fn delete(&self, uid: String) -> Result<Option<IndexMeta>> {
let this = self.clone();
tokio::task::spawn_blocking(move || this.delete(uid)).await?
}
async fn list(&self) -> Result<Vec<(String, IndexMeta)>> {
let this = self.clone();
tokio::task::spawn_blocking(move || this.list()).await?
}
async fn insert(&self, name: String, meta: IndexMeta) -> Result<()> {
let this = self.clone();
tokio::task::spawn_blocking(move || this.insert(name, meta)).await?
}
async fn snapshot(&self, path: PathBuf) -> Result<HashSet<Uuid>> {
let this = self.clone();
tokio::task::spawn_blocking(move || this.snapshot(path)).await?
}
async fn get_size(&self) -> Result<u64> {
self.get_size()
}
async fn dump(&self, path: PathBuf) -> Result<()> {
let this = self.clone();
Ok(tokio::task::spawn_blocking(move || this.dump(path)).await??)
}
}

View File

@@ -1,685 +0,0 @@
pub mod error;
pub mod index_store;
pub mod meta_store;
use std::convert::TryFrom;
use std::path::Path;
use std::sync::Arc;
use error::{IndexResolverError, Result};
use index_store::{IndexStore, MapIndexStore};
use meilisearch_types::error::ResponseError;
use meilisearch_types::index_uid::IndexUid;
use meta_store::{HeedMetaStore, IndexMetaStore};
use milli::heed::Env;
use milli::update::{DocumentDeletionResult, IndexerConfig};
use time::OffsetDateTime;
use tokio::task::spawn_blocking;
use uuid::Uuid;
use crate::index::{error::Result as IndexResult, Index};
use crate::options::IndexerOpts;
use crate::tasks::task::{DocumentDeletion, Task, TaskContent, TaskEvent, TaskId, TaskResult};
use crate::update_file_store::UpdateFileStore;
use self::meta_store::IndexMeta;
pub type HardStateIndexResolver = IndexResolver<HeedMetaStore, MapIndexStore>;
#[cfg(not(test))]
pub use real::IndexResolver;
#[cfg(test)]
pub use test::MockIndexResolver as IndexResolver;
pub fn create_index_resolver(
path: impl AsRef<Path>,
index_size: usize,
indexer_opts: &IndexerOpts,
meta_env: Arc<milli::heed::Env>,
file_store: UpdateFileStore,
) -> anyhow::Result<HardStateIndexResolver> {
let uuid_store = HeedMetaStore::new(meta_env)?;
let index_store = MapIndexStore::new(&path, index_size, indexer_opts)?;
Ok(IndexResolver::new(uuid_store, index_store, file_store))
}
mod real {
use super::*;
pub struct IndexResolver<U, I> {
pub(super) index_uuid_store: U,
pub(super) index_store: I,
pub(super) file_store: UpdateFileStore,
}
impl IndexResolver<HeedMetaStore, MapIndexStore> {
pub fn load_dump(
src: impl AsRef<Path>,
dst: impl AsRef<Path>,
index_db_size: usize,
env: Arc<Env>,
indexer_opts: &IndexerOpts,
) -> anyhow::Result<()> {
HeedMetaStore::load_dump(&src, env)?;
let indexes_path = src.as_ref().join("indexes");
let indexes = indexes_path.read_dir()?;
let indexer_config = IndexerConfig::try_from(indexer_opts)?;
for index in indexes {
Index::load_dump(&index?.path(), &dst, index_db_size, &indexer_config)?;
}
Ok(())
}
}
impl<U, I> IndexResolver<U, I>
where
U: IndexMetaStore,
I: IndexStore,
{
pub fn new(index_uuid_store: U, index_store: I, file_store: UpdateFileStore) -> Self {
Self {
index_uuid_store,
index_store,
file_store,
}
}
pub async fn process_document_addition_batch(&self, tasks: &mut [Task]) {
fn get_content_uuid(task: &Task) -> Uuid {
match task {
Task {
content: TaskContent::DocumentAddition { content_uuid, .. },
..
} => *content_uuid,
_ => panic!("unexpected task in the document addition batch"),
}
}
let content_uuids = tasks.iter().map(get_content_uuid).collect::<Vec<_>>();
match tasks.first() {
Some(Task {
id,
content:
TaskContent::DocumentAddition {
merge_strategy,
primary_key,
allow_index_creation,
index_uid,
..
},
..
}) => {
let primary_key = primary_key.clone();
let method = *merge_strategy;
let index = if *allow_index_creation {
self.get_or_create_index(index_uid.clone(), *id).await
} else {
self.get_index(index_uid.as_str().to_string()).await
};
// If the index doesn't exist and we are not allowed to create it with the first
// task, we must fail the whole batch.
let now = OffsetDateTime::now_utc();
let index = match index {
Ok(index) => index,
Err(e) => {
let error = ResponseError::from(e);
for task in tasks.iter_mut() {
task.events.push(TaskEvent::Failed {
error: error.clone(),
timestamp: now,
});
}
return;
}
};
let file_store = self.file_store.clone();
let result = spawn_blocking(move || {
index.update_documents(
method,
primary_key,
file_store,
content_uuids.into_iter(),
)
})
.await;
match result {
Ok(Ok(results)) => {
for (task, result) in tasks.iter_mut().zip(results) {
let event = match result {
Ok(addition) => {
TaskEvent::succeeded(TaskResult::DocumentAddition {
indexed_documents: addition.indexed_documents,
})
}
Err(error) => {
TaskEvent::failed(IndexResolverError::from(error))
}
};
task.events.push(event);
}
}
Ok(Err(e)) => {
let event = TaskEvent::failed(e);
for task in tasks.iter_mut() {
task.events.push(event.clone());
}
}
Err(e) => {
let event = TaskEvent::failed(IndexResolverError::from(e));
for task in tasks.iter_mut() {
task.events.push(event.clone());
}
}
}
}
_ => panic!("invalid batch!"),
}
}
pub async fn delete_content_file(&self, content_uuid: Uuid) -> Result<()> {
self.file_store.delete(content_uuid).await?;
Ok(())
}
async fn process_task_inner(&self, task: &Task) -> Result<TaskResult> {
match &task.content {
TaskContent::DocumentAddition { .. } => {
panic!("updates should be handled by batch")
}
TaskContent::DocumentDeletion {
deletion: DocumentDeletion::Ids(ids),
index_uid,
} => {
let ids = ids.clone();
let index = self.get_index(index_uid.clone().into_inner()).await?;
let DocumentDeletionResult {
deleted_documents, ..
} = spawn_blocking(move || index.delete_documents(&ids)).await??;
Ok(TaskResult::DocumentDeletion { deleted_documents })
}
TaskContent::DocumentDeletion {
deletion: DocumentDeletion::Clear,
index_uid,
} => {
let index = self.get_index(index_uid.clone().into_inner()).await?;
let deleted_documents = spawn_blocking(move || -> IndexResult<u64> {
let number_documents = index.stats()?.number_of_documents;
index.clear_documents()?;
Ok(number_documents)
})
.await??;
Ok(TaskResult::ClearAll { deleted_documents })
}
TaskContent::SettingsUpdate {
settings,
is_deletion,
allow_index_creation,
index_uid,
} => {
let index = if *is_deletion || !*allow_index_creation {
self.get_index(index_uid.clone().into_inner()).await?
} else {
self.get_or_create_index(index_uid.clone(), task.id).await?
};
let settings = settings.clone();
spawn_blocking(move || index.update_settings(&settings.check())).await??;
Ok(TaskResult::Other)
}
TaskContent::IndexDeletion { index_uid } => {
let index = self.delete_index(index_uid.clone().into_inner()).await?;
let deleted_documents = spawn_blocking(move || -> IndexResult<u64> {
Ok(index.stats()?.number_of_documents)
})
.await??;
Ok(TaskResult::ClearAll { deleted_documents })
}
TaskContent::IndexCreation {
primary_key,
index_uid,
} => {
let index = self.create_index(index_uid.clone(), task.id).await?;
if let Some(primary_key) = primary_key {
let primary_key = primary_key.clone();
spawn_blocking(move || index.update_primary_key(primary_key)).await??;
}
Ok(TaskResult::Other)
}
TaskContent::IndexUpdate {
primary_key,
index_uid,
} => {
let index = self.get_index(index_uid.clone().into_inner()).await?;
if let Some(primary_key) = primary_key {
let primary_key = primary_key.clone();
spawn_blocking(move || index.update_primary_key(primary_key)).await??;
}
Ok(TaskResult::Other)
}
_ => unreachable!("Invalid task for index resolver"),
}
}
pub async fn process_task(&self, task: &mut Task) {
match self.process_task_inner(task).await {
Ok(res) => task.events.push(TaskEvent::succeeded(res)),
Err(e) => task.events.push(TaskEvent::failed(e)),
}
}
pub async fn dump(&self, path: impl AsRef<Path>) -> Result<()> {
for (_, index) in self.list().await? {
index.dump(&path)?;
}
self.index_uuid_store.dump(path.as_ref().to_owned()).await?;
Ok(())
}
async fn create_index(&self, uid: IndexUid, creation_task_id: TaskId) -> Result<Index> {
match self.index_uuid_store.get(uid.into_inner()).await? {
(uid, Some(_)) => Err(IndexResolverError::IndexAlreadyExists(uid)),
(uid, None) => {
let uuid = Uuid::new_v4();
let index = self.index_store.create(uuid).await?;
match self
.index_uuid_store
.insert(
uid,
IndexMeta {
uuid,
creation_task_id,
},
)
.await
{
Err(e) => {
match self.index_store.delete(uuid).await {
Ok(Some(index)) => {
index.close();
}
Ok(None) => (),
Err(e) => log::error!("Error while deleting index: {:?}", e),
}
Err(e)
}
Ok(()) => Ok(index),
}
}
}
}
/// Get or create an index with name `uid`.
pub async fn get_or_create_index(&self, uid: IndexUid, task_id: TaskId) -> Result<Index> {
match self.create_index(uid, task_id).await {
Ok(index) => Ok(index),
Err(IndexResolverError::IndexAlreadyExists(uid)) => self.get_index(uid).await,
Err(e) => Err(e),
}
}
pub async fn list(&self) -> Result<Vec<(String, Index)>> {
let uuids = self.index_uuid_store.list().await?;
let mut indexes = Vec::new();
for (name, IndexMeta { uuid, .. }) in uuids {
match self.index_store.get(uuid).await? {
Some(index) => indexes.push((name, index)),
None => {
// We found a nonexistent index; remove it from the uuid store.
let _ = self.index_uuid_store.delete(name).await;
}
}
}
Ok(indexes)
}
pub async fn delete_index(&self, uid: String) -> Result<Index> {
match self.index_uuid_store.delete(uid.clone()).await? {
Some(IndexMeta { uuid, .. }) => match self.index_store.delete(uuid).await? {
Some(index) => {
index.clone().close();
Ok(index)
}
None => Err(IndexResolverError::UnexistingIndex(uid)),
},
None => Err(IndexResolverError::UnexistingIndex(uid)),
}
}
pub async fn get_index(&self, uid: String) -> Result<Index> {
match self.index_uuid_store.get(uid).await? {
(name, Some(IndexMeta { uuid, .. })) => {
match self.index_store.get(uuid).await? {
Some(index) => Ok(index),
None => {
// For some reason we got a uuid pointing to a nonexistent index: remove it
// from the uuid store and return an error.
let _ = self.index_uuid_store.delete(name.clone()).await;
Err(IndexResolverError::UnexistingIndex(name))
}
}
}
(name, _) => Err(IndexResolverError::UnexistingIndex(name)),
}
}
pub async fn get_index_creation_task_id(&self, index_uid: String) -> Result<TaskId> {
let (uid, meta) = self.index_uuid_store.get(index_uid).await?;
meta.map(
|IndexMeta {
creation_task_id, ..
}| creation_task_id,
)
.ok_or(IndexResolverError::UnexistingIndex(uid))
}
}
}
#[cfg(test)]
mod test {
use crate::index::IndexStats;
use super::index_store::MockIndexStore;
use super::meta_store::MockIndexMetaStore;
use super::*;
use futures::future::ok;
use milli::FieldDistribution;
use nelson::Mocker;
pub enum MockIndexResolver<U, I> {
Real(super::real::IndexResolver<U, I>),
Mock(Mocker),
}
impl MockIndexResolver<HeedMetaStore, MapIndexStore> {
pub fn load_dump(
src: impl AsRef<Path>,
dst: impl AsRef<Path>,
index_db_size: usize,
env: Arc<Env>,
indexer_opts: &IndexerOpts,
) -> anyhow::Result<()> {
super::real::IndexResolver::load_dump(src, dst, index_db_size, env, indexer_opts)
}
}
impl<U, I> MockIndexResolver<U, I>
where
U: IndexMetaStore,
I: IndexStore,
{
pub fn new(index_uuid_store: U, index_store: I, file_store: UpdateFileStore) -> Self {
Self::Real(super::real::IndexResolver {
index_uuid_store,
index_store,
file_store,
})
}
pub fn mock(mocker: Mocker) -> Self {
Self::Mock(mocker)
}
pub async fn process_document_addition_batch(&self, tasks: &mut [Task]) {
match self {
IndexResolver::Real(r) => r.process_document_addition_batch(tasks).await,
IndexResolver::Mock(m) => unsafe {
m.get("process_document_addition_batch").call(tasks)
},
}
}
pub async fn process_task(&self, task: &mut Task) {
match self {
IndexResolver::Real(r) => r.process_task(task).await,
IndexResolver::Mock(m) => unsafe { m.get("process_task").call(task) },
}
}
pub async fn dump(&self, path: impl AsRef<Path>) -> Result<()> {
match self {
IndexResolver::Real(r) => r.dump(path).await,
IndexResolver::Mock(_) => todo!(),
}
}
/// Get or create an index with name `uid`.
pub async fn get_or_create_index(&self, uid: IndexUid, task_id: TaskId) -> Result<Index> {
match self {
IndexResolver::Real(r) => r.get_or_create_index(uid, task_id).await,
IndexResolver::Mock(_) => todo!(),
}
}
pub async fn list(&self) -> Result<Vec<(String, Index)>> {
match self {
IndexResolver::Real(r) => r.list().await,
IndexResolver::Mock(_) => todo!(),
}
}
pub async fn delete_index(&self, uid: String) -> Result<Index> {
match self {
IndexResolver::Real(r) => r.delete_index(uid).await,
IndexResolver::Mock(_) => todo!(),
}
}
pub async fn get_index(&self, uid: String) -> Result<Index> {
match self {
IndexResolver::Real(r) => r.get_index(uid).await,
IndexResolver::Mock(_) => todo!(),
}
}
pub async fn get_index_creation_task_id(&self, index_uid: String) -> Result<TaskId> {
match self {
IndexResolver::Real(r) => r.get_index_creation_task_id(index_uid).await,
IndexResolver::Mock(_) => todo!(),
}
}
pub async fn delete_content_file(&self, content_uuid: Uuid) -> Result<()> {
match self {
IndexResolver::Real(r) => r.delete_content_file(content_uuid).await,
IndexResolver::Mock(m) => unsafe {
m.get("delete_content_file").call(content_uuid)
},
}
}
}
#[actix_rt::test]
async fn test_remove_unknown_index() {
let mut meta_store = MockIndexMetaStore::new();
meta_store
.expect_delete()
.once()
.returning(|_| Box::pin(ok(None)));
let index_store = MockIndexStore::new();
let mocker = Mocker::default();
let file_store = UpdateFileStore::mock(mocker);
let index_resolver = IndexResolver::new(meta_store, index_store, file_store);
let mut task = Task {
id: 1,
content: TaskContent::IndexDeletion {
index_uid: IndexUid::new_unchecked("test"),
},
events: Vec::new(),
};
index_resolver.process_task(&mut task).await;
assert!(matches!(task.events[0], TaskEvent::Failed { .. }));
}
#[actix_rt::test]
async fn test_remove_index() {
let mut meta_store = MockIndexMetaStore::new();
meta_store.expect_delete().once().returning(|_| {
Box::pin(ok(Some(IndexMeta {
uuid: Uuid::new_v4(),
creation_task_id: 1,
})))
});
let mut index_store = MockIndexStore::new();
index_store.expect_delete().once().returning(|_| {
let mocker = Mocker::default();
mocker.when::<(), ()>("close").then(|_| ());
mocker
.when::<(), IndexResult<IndexStats>>("stats")
.then(|_| {
Ok(IndexStats {
size: 10,
number_of_documents: 10,
is_indexing: None,
field_distribution: FieldDistribution::default(),
})
});
Box::pin(ok(Some(Index::mock(mocker))))
});
let mocker = Mocker::default();
let file_store = UpdateFileStore::mock(mocker);
let index_resolver = IndexResolver::new(meta_store, index_store, file_store);
let mut task = Task {
id: 1,
content: TaskContent::IndexDeletion {
index_uid: IndexUid::new_unchecked("test"),
},
events: Vec::new(),
};
index_resolver.process_task(&mut task).await;
assert!(matches!(task.events[0], TaskEvent::Succeeded { .. }));
}
#[actix_rt::test]
async fn test_delete_documents() {
let mut meta_store = MockIndexMetaStore::new();
meta_store.expect_get().once().returning(|_| {
Box::pin(ok((
"test".to_string(),
Some(IndexMeta {
uuid: Uuid::new_v4(),
creation_task_id: 1,
}),
)))
});
let mut index_store = MockIndexStore::new();
index_store.expect_get().once().returning(|_| {
let mocker = Mocker::default();
mocker
.when::<(), IndexResult<()>>("clear_documents")
.once()
.then(|_| Ok(()));
mocker
.when::<(), IndexResult<IndexStats>>("stats")
.once()
.then(|_| {
Ok(IndexStats {
size: 10,
number_of_documents: 10,
is_indexing: None,
field_distribution: FieldDistribution::default(),
})
});
Box::pin(ok(Some(Index::mock(mocker))))
});
let mocker = Mocker::default();
let file_store = UpdateFileStore::mock(mocker);
let index_resolver = IndexResolver::new(meta_store, index_store, file_store);
let mut task = Task {
id: 1,
content: TaskContent::DocumentDeletion {
deletion: DocumentDeletion::Clear,
index_uid: IndexUid::new_unchecked("test"),
},
events: Vec::new(),
};
index_resolver.process_task(&mut task).await;
assert!(matches!(task.events[0], TaskEvent::Succeeded { .. }));
}
#[actix_rt::test]
async fn test_index_update() {
let mut meta_store = MockIndexMetaStore::new();
meta_store.expect_get().once().returning(|_| {
Box::pin(ok((
"test".to_string(),
Some(IndexMeta {
uuid: Uuid::new_v4(),
creation_task_id: 1,
}),
)))
});
let mut index_store = MockIndexStore::new();
index_store.expect_get().once().returning(|_| {
let mocker = Mocker::default();
mocker
.when::<String, IndexResult<crate::index::IndexMeta>>("update_primary_key")
.once()
.then(|_| {
Ok(crate::index::IndexMeta {
created_at: OffsetDateTime::now_utc(),
updated_at: OffsetDateTime::now_utc(),
primary_key: Some("key".to_string()),
})
});
Box::pin(ok(Some(Index::mock(mocker))))
});
let mocker = Mocker::default();
let file_store = UpdateFileStore::mock(mocker);
let index_resolver = IndexResolver::new(meta_store, index_store, file_store);
let mut task = Task {
id: 1,
content: TaskContent::IndexUpdate {
primary_key: Some("key".to_string()),
index_uid: IndexUid::new_unchecked("test"),
},
events: Vec::new(),
};
index_resolver.process_task(&mut task).await;
assert!(matches!(task.events[0], TaskEvent::Succeeded { .. }));
}
}

View File

@@ -1,37 +0,0 @@
#[macro_use]
pub mod error;
pub mod options;
mod analytics;
mod dump;
pub mod index;
pub mod index_controller;
mod index_resolver;
mod snapshot;
pub mod tasks;
mod update_file_store;
use std::path::Path;
pub use index_controller::MeiliSearch;
pub use milli;
pub use milli::heed;
mod compression;
pub mod document_formats;
/// Check whether a db is empty. This gives no information about the
/// validity of the data it contains.
/// A database is considered non-empty when it is a non-empty directory.
pub fn is_empty_db(db_path: impl AsRef<Path>) -> bool {
let db_path = db_path.as_ref();
if !db_path.exists() {
true
// if we cannot read the directory (for example the path is a file) we also consider the db empty
} else if let Ok(dir) = db_path.read_dir() {
dir.count() == 0
} else {
true
}
}
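// A hedged sketch of the rules above, assuming the `tempfile` dependency: a missing
// path and an empty directory both count as "empty", a directory with any entry does not.
#[cfg(test)]
#[test]
fn empty_db_detection() -> std::io::Result<()> {
    let dir = tempfile::tempdir()?;
    assert!(is_empty_db(dir.path().join("not-created-yet")));
    assert!(is_empty_db(dir.path()));
    std::fs::write(dir.path().join("VERSION"), "0.0.0")?;
    assert!(!is_empty_db(dir.path()));
    Ok(())
}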

View File

@@ -1,166 +0,0 @@
use core::fmt;
use std::{convert::TryFrom, num::ParseIntError, ops::Deref, str::FromStr};
use byte_unit::{Byte, ByteError};
use clap::Parser;
use milli::update::IndexerConfig;
use serde::Serialize;
use sysinfo::{RefreshKind, System, SystemExt};
#[derive(Debug, Clone, Parser, Serialize)]
pub struct IndexerOpts {
/// The number of documents to process before printing
/// a log regarding the indexing progress.
#[serde(skip)]
#[clap(long, default_value = "100000", hide = true)] // 100k
pub log_every_n: usize,
/// The maximum number of chunks grenad may use during indexing.
#[serde(skip)]
#[clap(long, hide = true)]
pub max_nb_chunks: Option<usize>,
/// The maximum amount of memory the indexer will use. It defaults to 2/3
/// of the available memory. It is recommended to use something like 80%-90%
/// of the available memory, no more.
///
/// If the engine is unable to determine the available memory, it will try to use
/// as much memory as it needs, without any real limit. This can lead to
/// out-of-memory issues, so it is recommended to specify the amount of memory to use.
#[clap(long, env = "MEILI_MAX_INDEXING_MEMORY", default_value_t)]
pub max_indexing_memory: MaxMemory,
/// The maximum number of threads the indexer will use.
/// If the number set is higher than the real number of cores available in the machine,
/// it will use the maximum number of available cores.
///
/// It defaults to half of the available threads.
#[clap(long, env = "MEILI_MAX_INDEXING_THREADS", default_value_t)]
pub max_indexing_threads: MaxThreads,
}
#[derive(Debug, Clone, Parser, Default, Serialize)]
pub struct SchedulerConfig {
/// The engine will disable task auto-batching,
/// and will process each task sequentially, one by one.
#[clap(long, env = "DISABLE_AUTO_BATCHING")]
pub disable_auto_batching: bool,
}
impl TryFrom<&IndexerOpts> for IndexerConfig {
type Error = anyhow::Error;
fn try_from(other: &IndexerOpts) -> Result<Self, Self::Error> {
let thread_pool = rayon::ThreadPoolBuilder::new()
.num_threads(*other.max_indexing_threads)
.build()?;
Ok(Self {
log_every_n: Some(other.log_every_n),
max_nb_chunks: other.max_nb_chunks,
max_memory: other.max_indexing_memory.map(|b| b.get_bytes() as usize),
thread_pool: Some(thread_pool),
max_positions_per_attributes: None,
..Default::default()
})
}
}
impl Default for IndexerOpts {
fn default() -> Self {
Self {
log_every_n: 100_000,
max_nb_chunks: None,
max_indexing_memory: MaxMemory::default(),
max_indexing_threads: MaxThreads::default(),
}
}
}
/// A type that detects the total memory available and defaults to 2/3 of it.
#[derive(Debug, Clone, Copy, Serialize)]
pub struct MaxMemory(Option<Byte>);
impl FromStr for MaxMemory {
type Err = ByteError;
fn from_str(s: &str) -> Result<MaxMemory, ByteError> {
Byte::from_str(s).map(Some).map(MaxMemory)
}
}
impl Default for MaxMemory {
fn default() -> MaxMemory {
MaxMemory(
total_memory_bytes()
.map(|bytes| bytes * 2 / 3)
.map(Byte::from_bytes),
)
}
}
impl fmt::Display for MaxMemory {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
match self.0 {
Some(memory) => write!(f, "{}", memory.get_appropriate_unit(true)),
None => f.write_str("unknown"),
}
}
}
impl Deref for MaxMemory {
type Target = Option<Byte>;
fn deref(&self) -> &Self::Target {
&self.0
}
}
impl MaxMemory {
pub fn unlimited() -> Self {
Self(None)
}
}
/// Returns the total amount of memory, in bytes, or `None` if this system isn't supported.
fn total_memory_bytes() -> Option<u64> {
if System::IS_SUPPORTED {
let memory_kind = RefreshKind::new().with_memory();
let mut system = System::new_with_specifics(memory_kind);
system.refresh_memory();
Some(system.total_memory() * 1024) // KiB into bytes
} else {
None
}
}
#[derive(Debug, Clone, Copy, Serialize)]
pub struct MaxThreads(usize);
impl FromStr for MaxThreads {
type Err = ParseIntError;
fn from_str(s: &str) -> Result<Self, Self::Err> {
usize::from_str(s).map(Self)
}
}
impl Default for MaxThreads {
fn default() -> Self {
MaxThreads(num_cpus::get() / 2)
}
}
impl fmt::Display for MaxThreads {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "{}", self.0)
}
}
impl Deref for MaxThreads {
type Target = usize;
fn deref(&self) -> &Self::Target {
&self.0
}
}
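// A hedged sketch of how these knobs behave: both implement `FromStr`, so they can be
// parsed from CLI flags or environment variables, and `MaxMemory::default()` is 2/3 of
// the detected total memory (or unlimited when detection is unsupported). The exact
// string formats accepted below are assumptions about the `byte_unit` parser.
#[cfg(test)]
#[test]
fn parse_indexer_limits() {
    let memory: MaxMemory = "2 GiB".parse().unwrap();
    assert!(memory.is_some());
    assert_eq!(format!("{}", MaxMemory::unlimited()), "unknown");
    let threads: MaxThreads = "4".parse().unwrap();
    assert_eq!(*threads, 4);
}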

View File

@@ -1,202 +0,0 @@
use std::fs;
use std::path::{Path, PathBuf};
use std::sync::Arc;
use std::time::Duration;
use anyhow::bail;
use fs_extra::dir::{self, CopyOptions};
use log::{info, trace};
use meilisearch_auth::open_auth_store_env;
use milli::heed::CompactionOption;
use tokio::sync::RwLock;
use tokio::time::sleep;
use walkdir::WalkDir;
use crate::compression::from_tar_gz;
use crate::index_controller::open_meta_env;
use crate::index_controller::versioning::VERSION_FILE_NAME;
use crate::tasks::Scheduler;
pub struct SnapshotService {
pub(crate) db_path: PathBuf,
pub(crate) snapshot_period: Duration,
pub(crate) snapshot_path: PathBuf,
pub(crate) index_size: usize,
pub(crate) meta_env_size: usize,
pub(crate) scheduler: Arc<RwLock<Scheduler>>,
}
impl SnapshotService {
pub async fn run(self) {
info!(
"Snapshot scheduled every {}s.",
self.snapshot_period.as_secs()
);
loop {
let snapshot_job = SnapshotJob {
dest_path: self.snapshot_path.clone(),
src_path: self.db_path.clone(),
meta_env_size: self.meta_env_size,
index_size: self.index_size,
};
self.scheduler.write().await.schedule_snapshot(snapshot_job);
sleep(self.snapshot_period).await;
}
}
}
pub fn load_snapshot(
db_path: impl AsRef<Path>,
snapshot_path: impl AsRef<Path>,
ignore_snapshot_if_db_exists: bool,
ignore_missing_snapshot: bool,
) -> anyhow::Result<()> {
let empty_db = crate::is_empty_db(&db_path);
let snapshot_path_exists = snapshot_path.as_ref().exists();
if empty_db && snapshot_path_exists {
match from_tar_gz(snapshot_path, &db_path) {
Ok(()) => Ok(()),
Err(e) => {
// Clean up the db folder that was just created.
std::fs::remove_dir_all(&db_path)?;
Err(e)
}
}
} else if !empty_db && !ignore_snapshot_if_db_exists {
bail!(
"database already exists at {:?}, try to delete it or rename it",
db_path
.as_ref()
.canonicalize()
.unwrap_or_else(|_| db_path.as_ref().to_owned())
)
} else if !snapshot_path_exists && !ignore_missing_snapshot {
bail!("snapshot doesn't exist at {:?}", snapshot_path.as_ref())
} else {
Ok(())
}
}
#[derive(Debug)]
pub struct SnapshotJob {
dest_path: PathBuf,
src_path: PathBuf,
meta_env_size: usize,
index_size: usize,
}
impl SnapshotJob {
pub async fn run(self) -> anyhow::Result<()> {
tokio::task::spawn_blocking(|| self.run_sync()).await??;
Ok(())
}
fn run_sync(self) -> anyhow::Result<()> {
trace!("Performing snapshot.");
let snapshot_dir = self.dest_path.clone();
std::fs::create_dir_all(&snapshot_dir)?;
let temp_snapshot_dir = tempfile::tempdir()?;
let temp_snapshot_path = temp_snapshot_dir.path();
self.snapshot_version_file(temp_snapshot_path)?;
self.snapshot_meta_env(temp_snapshot_path)?;
self.snapshot_file_store(temp_snapshot_path)?;
self.snapshot_indexes(temp_snapshot_path)?;
self.snapshot_auth(temp_snapshot_path)?;
let db_name = self
.src_path
.file_name()
.and_then(|n| n.to_str())
.unwrap_or("data.ms")
.to_string();
let snapshot_path = self.dest_path.join(format!("{}.snapshot", db_name));
let temp_snapshot_file = tempfile::NamedTempFile::new_in(&snapshot_dir)?;
let temp_snapshot_file_path = temp_snapshot_file.path().to_owned();
crate::compression::to_tar_gz(temp_snapshot_path, temp_snapshot_file_path)?;
let _file = temp_snapshot_file.persist(&snapshot_path)?;
#[cfg(unix)]
{
use std::fs::Permissions;
use std::os::unix::fs::PermissionsExt;
let perm = Permissions::from_mode(0o644);
_file.set_permissions(perm)?;
}
trace!("Created snapshot in {:?}.", snapshot_path);
Ok(())
}
fn snapshot_version_file(&self, path: &Path) -> anyhow::Result<()> {
let dst = path.join(VERSION_FILE_NAME);
let src = self.src_path.join(VERSION_FILE_NAME);
fs::copy(src, dst)?;
Ok(())
}
fn snapshot_meta_env(&self, path: &Path) -> anyhow::Result<()> {
let env = open_meta_env(&self.src_path, self.meta_env_size)?;
let dst = path.join("data.mdb");
env.copy_to_path(dst, milli::heed::CompactionOption::Enabled)?;
Ok(())
}
fn snapshot_file_store(&self, path: &Path) -> anyhow::Result<()> {
// For now we simply copy the updates/updates_files directory.
// FIXME(marin): We may copy more files than necessary if new files are added while we are
// performing the snapshot. We need a way to filter them out.
let dst = path.join("updates");
fs::create_dir_all(&dst)?;
let options = CopyOptions::default();
dir::copy(self.src_path.join("updates/updates_files"), dst, &options)?;
Ok(())
}
fn snapshot_indexes(&self, path: &Path) -> anyhow::Result<()> {
let indexes_path = self.src_path.join("indexes/");
let dst = path.join("indexes/");
for entry in WalkDir::new(indexes_path).max_depth(1).into_iter().skip(1) {
let entry = entry?;
let name = entry.file_name();
let dst = dst.join(name);
std::fs::create_dir_all(&dst)?;
let dst = dst.join("data.mdb");
let mut options = milli::heed::EnvOpenOptions::new();
options.map_size(self.index_size);
let index = milli::Index::new(options, entry.path())?;
index.copy_to_path(dst, CompactionOption::Enabled)?;
}
Ok(())
}
fn snapshot_auth(&self, path: &Path) -> anyhow::Result<()> {
let auth_path = self.src_path.join("auth");
let dst = path.join("auth");
std::fs::create_dir_all(&dst)?;
let dst = dst.join("data.mdb");
let env = open_auth_store_env(&auth_path)?;
env.copy_to_path(dst, milli::heed::CompactionOption::Enabled)?;
Ok(())
}
}

View File

@@ -1,75 +0,0 @@
use time::OffsetDateTime;
use crate::snapshot::SnapshotJob;
use super::task::{Task, TaskEvent};
pub type BatchId = u32;
#[derive(Debug)]
pub enum BatchContent {
DocumentsAdditionBatch(Vec<Task>),
IndexUpdate(Task),
Dump(Task),
Snapshot(SnapshotJob),
// Symbolizes an empty batch. This can occur when we were woken up, but there wasn't any work to do.
Empty,
}
impl BatchContent {
pub fn first(&self) -> Option<&Task> {
match self {
BatchContent::DocumentsAdditionBatch(ts) => ts.first(),
BatchContent::Dump(t) | BatchContent::IndexUpdate(t) => Some(t),
BatchContent::Snapshot(_) | BatchContent::Empty => None,
}
}
pub fn push_event(&mut self, event: TaskEvent) {
match self {
BatchContent::DocumentsAdditionBatch(ts) => {
ts.iter_mut().for_each(|t| t.events.push(event.clone()))
}
BatchContent::IndexUpdate(t) | BatchContent::Dump(t) => t.events.push(event),
BatchContent::Snapshot(_) | BatchContent::Empty => (),
}
}
}
#[derive(Debug)]
pub struct Batch {
// Only batches that contain persistent tasks are given an id. Snapshot batches don't have
// an id.
pub id: Option<BatchId>,
pub created_at: OffsetDateTime,
pub content: BatchContent,
}
impl Batch {
pub fn new(id: Option<BatchId>, content: BatchContent) -> Self {
Self {
id,
created_at: OffsetDateTime::now_utc(),
content,
}
}
pub fn len(&self) -> usize {
match self.content {
BatchContent::DocumentsAdditionBatch(ref ts) => ts.len(),
BatchContent::IndexUpdate(_) | BatchContent::Dump(_) | BatchContent::Snapshot(_) => 1,
BatchContent::Empty => 0,
}
}
pub fn is_empty(&self) -> bool {
self.len() == 0
}
pub fn empty() -> Self {
Self {
id: None,
created_at: OffsetDateTime::now_utc(),
content: BatchContent::Empty,
}
}
}
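// A small hedged sketch: an empty batch carries no id and reports zero work, which is
// what lets the scheduler hand it to the `EmptyBatchHandler` as a no-op further down.
#[cfg(test)]
#[test]
fn empty_batch_has_no_work() {
    let batch = Batch::empty();
    assert!(batch.id.is_none());
    assert!(batch.is_empty());
    assert!(batch.content.first().is_none());
}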

View File

@@ -1,34 +0,0 @@
use meilisearch_types::error::{Code, ErrorCode};
use meilisearch_types::internal_error;
use tokio::task::JoinError;
use crate::update_file_store::UpdateFileStoreError;
use super::task::TaskId;
pub type Result<T> = std::result::Result<T, TaskError>;
#[derive(Debug, thiserror::Error)]
pub enum TaskError {
#[error("Task `{0}` not found.")]
UnexistingTask(TaskId),
#[error("Internal error: {0}")]
Internal(Box<dyn std::error::Error + Send + Sync + 'static>),
}
internal_error!(
TaskError: milli::heed::Error,
JoinError,
std::io::Error,
serde_json::Error,
UpdateFileStoreError
);
impl ErrorCode for TaskError {
fn error_code(&self) -> Code {
match self {
TaskError::UnexistingTask(_) => Code::TaskNotFound,
TaskError::Internal(_) => Code::Internal,
}
}
}

View File

@@ -1,132 +0,0 @@
use crate::dump::DumpHandler;
use crate::index_resolver::index_store::IndexStore;
use crate::index_resolver::meta_store::IndexMetaStore;
use crate::tasks::batch::{Batch, BatchContent};
use crate::tasks::task::{Task, TaskContent, TaskEvent, TaskResult};
use crate::tasks::BatchHandler;
#[async_trait::async_trait]
impl<U, I> BatchHandler for DumpHandler<U, I>
where
U: IndexMetaStore + Sync + Send + 'static,
I: IndexStore + Sync + Send + 'static,
{
fn accept(&self, batch: &Batch) -> bool {
matches!(batch.content, BatchContent::Dump { .. })
}
async fn process_batch(&self, mut batch: Batch) -> Batch {
match &batch.content {
BatchContent::Dump(Task {
content: TaskContent::Dump { uid },
..
}) => {
match self.run(uid.clone()).await {
Ok(_) => {
batch
.content
.push_event(TaskEvent::succeeded(TaskResult::Other));
}
Err(e) => batch.content.push_event(TaskEvent::failed(e)),
}
batch
}
_ => unreachable!("invalid batch content for dump"),
}
}
async fn finish(&self, _: &Batch) {}
}
#[cfg(test)]
mod test {
use crate::dump::error::{DumpError, Result as DumpResult};
use crate::index_resolver::{index_store::MockIndexStore, meta_store::MockIndexMetaStore};
use crate::tasks::handlers::test::task_to_batch;
use super::*;
use nelson::Mocker;
use proptest::prelude::*;
proptest! {
#[test]
fn finish_does_nothing(
task in any::<Task>(),
) {
let rt = tokio::runtime::Runtime::new().unwrap();
let handle = rt.spawn(async {
let batch = task_to_batch(task);
let mocker = Mocker::default();
let dump_handler = DumpHandler::<MockIndexMetaStore, MockIndexStore>::mock(mocker);
dump_handler.finish(&batch).await;
});
rt.block_on(handle).unwrap();
}
#[test]
fn test_handle_dump_success(
task in any::<Task>(),
) {
let rt = tokio::runtime::Runtime::new().unwrap();
let handle = rt.spawn(async {
let batch = task_to_batch(task);
let should_accept = matches!(batch.content, BatchContent::Dump { .. });
let mocker = Mocker::default();
if should_accept {
mocker.when::<String, DumpResult<()>>("run")
.once()
.then(|_| Ok(()));
}
let dump_handler = DumpHandler::<MockIndexMetaStore, MockIndexStore>::mock(mocker);
let accept = dump_handler.accept(&batch);
assert_eq!(accept, should_accept);
if accept {
let batch = dump_handler.process_batch(batch).await;
let last_event = batch.content.first().unwrap().events.last().unwrap();
assert!(matches!(last_event, TaskEvent::Succeeded { .. }));
}
});
rt.block_on(handle).unwrap();
}
#[test]
fn test_handle_dump_error(
task in any::<Task>(),
) {
let rt = tokio::runtime::Runtime::new().unwrap();
let handle = rt.spawn(async {
let batch = task_to_batch(task);
let should_accept = matches!(batch.content, BatchContent::Dump { .. });
let mocker = Mocker::default();
if should_accept {
mocker.when::<String, DumpResult<()>>("run")
.once()
.then(|_| Err(DumpError::Internal("error".into())));
}
let dump_handler = DumpHandler::<MockIndexMetaStore, MockIndexStore>::mock(mocker);
let accept = dump_handler.accept(&batch);
assert_eq!(accept, should_accept);
if accept {
let batch = dump_handler.process_batch(batch).await;
let last_event = batch.content.first().unwrap().events.last().unwrap();
assert!(matches!(last_event, TaskEvent::Failed { .. }));
}
});
rt.block_on(handle).unwrap();
}
}
}

View File

@@ -1,18 +0,0 @@
use crate::tasks::batch::{Batch, BatchContent};
use crate::tasks::BatchHandler;
/// A sink handler for empty tasks.
pub struct EmptyBatchHandler;
#[async_trait::async_trait]
impl BatchHandler for EmptyBatchHandler {
fn accept(&self, batch: &Batch) -> bool {
matches!(batch.content, BatchContent::Empty)
}
async fn process_batch(&self, batch: Batch) -> Batch {
batch
}
async fn finish(&self, _: &Batch) {}
}

View File

@@ -1,199 +0,0 @@
use crate::index_resolver::IndexResolver;
use crate::index_resolver::{index_store::IndexStore, meta_store::IndexMetaStore};
use crate::tasks::batch::{Batch, BatchContent};
use crate::tasks::BatchHandler;
#[async_trait::async_trait]
impl<U, I> BatchHandler for IndexResolver<U, I>
where
U: IndexMetaStore + Send + Sync + 'static,
I: IndexStore + Send + Sync + 'static,
{
fn accept(&self, batch: &Batch) -> bool {
matches!(
batch.content,
BatchContent::DocumentsAdditionBatch(_) | BatchContent::IndexUpdate(_)
)
}
async fn process_batch(&self, mut batch: Batch) -> Batch {
match batch.content {
BatchContent::DocumentsAdditionBatch(ref mut tasks) => {
self.process_document_addition_batch(tasks).await;
}
BatchContent::IndexUpdate(ref mut task) => {
self.process_task(task).await;
}
_ => unreachable!(),
}
batch
}
async fn finish(&self, batch: &Batch) {
if let BatchContent::DocumentsAdditionBatch(ref tasks) = batch.content {
for task in tasks {
if let Some(content_uuid) = task.get_content_uuid() {
if let Err(e) = self.delete_content_file(content_uuid).await {
log::error!("error deleting update file: {}", e);
}
}
}
}
}
}
#[cfg(test)]
mod test {
use crate::index_resolver::index_store::MapIndexStore;
use crate::index_resolver::meta_store::HeedMetaStore;
use crate::index_resolver::{
error::Result as IndexResult, index_store::MockIndexStore, meta_store::MockIndexMetaStore,
};
use crate::tasks::{
handlers::test::task_to_batch,
task::{Task, TaskContent},
};
use crate::update_file_store::{Result as FileStoreResult, UpdateFileStore};
use super::*;
use meilisearch_types::index_uid::IndexUid;
use milli::update::IndexDocumentsMethod;
use nelson::Mocker;
use proptest::prelude::*;
use uuid::Uuid;
proptest! {
#[test]
fn test_accept_task(
task in any::<Task>(),
) {
let batch = task_to_batch(task);
let index_store = MockIndexStore::new();
let meta_store = MockIndexMetaStore::new();
let mocker = Mocker::default();
let update_file_store = UpdateFileStore::mock(mocker);
let index_resolver = IndexResolver::new(meta_store, index_store, update_file_store);
match batch.content {
BatchContent::DocumentsAdditionBatch(_)
| BatchContent::IndexUpdate(_) => assert!(index_resolver.accept(&batch)),
BatchContent::Dump(_)
| BatchContent::Snapshot(_)
| BatchContent::Empty => assert!(!index_resolver.accept(&batch)),
}
}
}
#[actix_rt::test]
async fn finisher_called_on_document_update() {
let index_store = MockIndexStore::new();
let meta_store = MockIndexMetaStore::new();
let mocker = Mocker::default();
let content_uuid = Uuid::new_v4();
mocker
.when::<Uuid, FileStoreResult<()>>("delete")
.once()
.then(move |uuid| {
assert_eq!(uuid, content_uuid);
Ok(())
});
let update_file_store = UpdateFileStore::mock(mocker);
let index_resolver = IndexResolver::new(meta_store, index_store, update_file_store);
let task = Task {
id: 1,
content: TaskContent::DocumentAddition {
content_uuid,
merge_strategy: IndexDocumentsMethod::ReplaceDocuments,
primary_key: None,
documents_count: 100,
allow_index_creation: true,
index_uid: IndexUid::new_unchecked("test"),
},
events: Vec::new(),
};
let batch = task_to_batch(task);
index_resolver.finish(&batch).await;
}
#[actix_rt::test]
#[should_panic]
async fn panic_when_passed_unsupported_batch() {
let index_store = MockIndexStore::new();
let meta_store = MockIndexMetaStore::new();
let mocker = Mocker::default();
let update_file_store = UpdateFileStore::mock(mocker);
let index_resolver = IndexResolver::new(meta_store, index_store, update_file_store);
let task = Task {
id: 1,
content: TaskContent::Dump {
uid: String::from("hello"),
},
events: Vec::new(),
};
let batch = task_to_batch(task);
index_resolver.process_batch(batch).await;
}
proptest! {
#[test]
fn index_document_task_deletes_update_file(
task in any::<Task>(),
) {
let rt = tokio::runtime::Runtime::new().unwrap();
let handle = rt.spawn(async {
let mocker = Mocker::default();
if let TaskContent::DocumentAddition{ .. } = task.content {
mocker.when::<Uuid, IndexResult<()>>("delete_content_file").then(|_| Ok(()));
}
let index_resolver: IndexResolver<HeedMetaStore, MapIndexStore> = IndexResolver::mock(mocker);
let batch = task_to_batch(task);
index_resolver.finish(&batch).await;
});
rt.block_on(handle).unwrap();
}
#[test]
fn test_handle_batch(task in any::<Task>()) {
let rt = tokio::runtime::Runtime::new().unwrap();
let handle = rt.spawn(async {
let mocker = Mocker::default();
match task.content {
TaskContent::DocumentAddition { .. } => {
mocker.when::<&mut [Task], ()>("process_document_addition_batch").then(|_| ());
}
TaskContent::Dump { .. } => (),
_ => {
mocker.when::<&mut Task, ()>("process_task").then(|_| ());
}
}
let index_resolver: IndexResolver<HeedMetaStore, MapIndexStore> = IndexResolver::mock(mocker);
let batch = task_to_batch(task);
if index_resolver.accept(&batch) {
index_resolver.process_batch(batch).await;
}
});
if let Err(e) = rt.block_on(handle) {
if e.is_panic() {
std::panic::resume_unwind(e.into_panic());
}
}
}
}
}

View File

@@ -1,34 +0,0 @@
pub mod dump_handler;
pub mod empty_handler;
mod index_resolver_handler;
pub mod snapshot_handler;
#[cfg(test)]
mod test {
use time::OffsetDateTime;
use crate::tasks::{
batch::{Batch, BatchContent},
task::{Task, TaskContent},
};
pub fn task_to_batch(task: Task) -> Batch {
let content = match task.content {
TaskContent::DocumentAddition { .. } => {
BatchContent::DocumentsAdditionBatch(vec![task])
}
TaskContent::DocumentDeletion { .. }
| TaskContent::SettingsUpdate { .. }
| TaskContent::IndexDeletion { .. }
| TaskContent::IndexCreation { .. }
| TaskContent::IndexUpdate { .. } => BatchContent::IndexUpdate(task),
TaskContent::Dump { .. } => BatchContent::Dump(task),
};
Batch {
id: Some(1),
created_at: OffsetDateTime::now_utc(),
content,
}
}
}

View File

@@ -1,26 +0,0 @@
use crate::tasks::batch::{Batch, BatchContent};
use crate::tasks::BatchHandler;
pub struct SnapshotHandler;
#[async_trait::async_trait]
impl BatchHandler for SnapshotHandler {
fn accept(&self, batch: &Batch) -> bool {
matches!(batch.content, BatchContent::Snapshot(_))
}
async fn process_batch(&self, batch: Batch) -> Batch {
match batch.content {
BatchContent::Snapshot(job) => {
if let Err(e) = job.run().await {
log::error!("snapshot error: {e}");
}
}
_ => unreachable!(),
}
Batch::empty()
}
async fn finish(&self, _: &Batch) {}
}

View File

@@ -1,56 +0,0 @@
use async_trait::async_trait;
pub use handlers::empty_handler::EmptyBatchHandler;
pub use handlers::snapshot_handler::SnapshotHandler;
pub use scheduler::Scheduler;
pub use task_store::TaskFilter;
#[cfg(test)]
pub use task_store::test::MockTaskStore as TaskStore;
#[cfg(not(test))]
pub use task_store::TaskStore;
use batch::Batch;
use error::Result;
pub mod batch;
pub mod error;
mod handlers;
mod scheduler;
pub mod task;
mod task_store;
pub mod update_loop;
#[cfg_attr(test, mockall::automock(type Error=test::DebugError;))]
#[async_trait]
pub trait BatchHandler: Sync + Send + 'static {
    /// Returns whether this handler can accept this batch.
fn accept(&self, batch: &Batch) -> bool;
    /// Processes the `Task` batch, returning the batch with its `Task`s updated.
    ///
    /// It is OK for this function to panic if it is handed a batch that hasn't been verified
    /// by `accept` beforehand.
async fn process_batch(&self, batch: Batch) -> Batch;
    /// `finish` is called once the result of `process_batch` has been committed to the task
    /// store. This method can be used, for example, to perform cleanup after the update has
    /// completed.
async fn finish(&self, batch: &Batch);
}
#[cfg(test)]
mod test {
use serde::{Deserialize, Serialize};
use std::fmt::Display;
#[derive(Debug, Serialize, Deserialize)]
pub struct DebugError;
impl Display for DebugError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.write_str("an error")
}
}
impl std::error::Error for DebugError {}
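    // A hedged sketch (not part of the original module) of the `BatchHandler` contract
    // documented above: `accept` gates the batch, `process_batch` hands it back, and `finish`
    // runs after the result has been committed. The empty handler makes every step a no-op.
    #[test]
    fn empty_handler_contract_sketch() {
        use super::{BatchHandler, EmptyBatchHandler};
        use crate::tasks::batch::Batch;
        let rt = tokio::runtime::Runtime::new().unwrap();
        rt.block_on(async {
            let handler = EmptyBatchHandler;
            let batch = Batch::empty();
            // Only batches accepted by the handler may be handed to `process_batch`.
            assert!(handler.accept(&batch));
            let batch = handler.process_batch(batch).await;
            assert!(batch.is_empty());
            handler.finish(&batch).await;
        });
    }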
}

View File

@@ -1,195 +0,0 @@
use meilisearch_types::error::ResponseError;
use meilisearch_types::index_uid::IndexUid;
use milli::update::{DocumentAdditionResult, IndexDocumentsMethod};
use serde::{Deserialize, Serialize};
use time::OffsetDateTime;
use uuid::Uuid;
use super::batch::BatchId;
use crate::index::{Settings, Unchecked};
pub type TaskId = u32;
#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq)]
#[cfg_attr(test, derive(proptest_derive::Arbitrary))]
pub enum TaskResult {
DocumentAddition { indexed_documents: u64 },
DocumentDeletion { deleted_documents: u64 },
ClearAll { deleted_documents: u64 },
Other,
}
impl From<DocumentAdditionResult> for TaskResult {
fn from(other: DocumentAdditionResult) -> Self {
Self::DocumentAddition {
indexed_documents: other.indexed_documents,
}
}
}
#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq)]
#[cfg_attr(test, derive(proptest_derive::Arbitrary))]
pub enum TaskEvent {
Created(
#[cfg_attr(test, proptest(strategy = "test::datetime_strategy()"))]
#[serde(with = "time::serde::rfc3339")]
OffsetDateTime,
),
Batched {
#[cfg_attr(test, proptest(strategy = "test::datetime_strategy()"))]
#[serde(with = "time::serde::rfc3339")]
timestamp: OffsetDateTime,
batch_id: BatchId,
},
Processing(
#[cfg_attr(test, proptest(strategy = "test::datetime_strategy()"))]
#[serde(with = "time::serde::rfc3339")]
OffsetDateTime,
),
Succeeded {
result: TaskResult,
#[cfg_attr(test, proptest(strategy = "test::datetime_strategy()"))]
#[serde(with = "time::serde::rfc3339")]
timestamp: OffsetDateTime,
},
Failed {
error: ResponseError,
#[cfg_attr(test, proptest(strategy = "test::datetime_strategy()"))]
#[serde(with = "time::serde::rfc3339")]
timestamp: OffsetDateTime,
},
}
impl TaskEvent {
pub fn succeeded(result: TaskResult) -> Self {
Self::Succeeded {
result,
timestamp: OffsetDateTime::now_utc(),
}
}
pub fn failed(error: impl Into<ResponseError>) -> Self {
Self::Failed {
error: error.into(),
timestamp: OffsetDateTime::now_utc(),
}
}
}
/// A task represents an operation that Meilisearch must do.
/// It's stored on disk and executed from the lowest to highest Task id.
/// Every time a new task is created it has a higher Task id than the previous one.
/// See also `Job`.
#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
#[cfg_attr(test, derive(proptest_derive::Arbitrary))]
pub struct Task {
pub id: TaskId,
    /// The content of the task. The index uid the task targets (if any) now lives inside the
    /// `TaskContent` itself and can be retrieved with `Task::index_uid`; tasks that don't
    /// target an index (e.g. a dump task) have no index uid.
pub content: TaskContent,
pub events: Vec<TaskEvent>,
}
impl Task {
/// Return true when a task is finished.
/// A task is finished when its last state is either `Succeeded` or `Failed`.
pub fn is_finished(&self) -> bool {
self.events.last().map_or(false, |event| {
matches!(
event,
TaskEvent::Succeeded { .. } | TaskEvent::Failed { .. }
)
})
}
/// Return the content_uuid of the `Task` if there is one.
pub fn get_content_uuid(&self) -> Option<Uuid> {
match self {
Task {
content: TaskContent::DocumentAddition { content_uuid, .. },
..
} => Some(*content_uuid),
_ => None,
}
}
pub fn index_uid(&self) -> Option<&str> {
match &self.content {
TaskContent::DocumentAddition { index_uid, .. }
| TaskContent::DocumentDeletion { index_uid, .. }
| TaskContent::SettingsUpdate { index_uid, .. }
| TaskContent::IndexDeletion { index_uid }
| TaskContent::IndexCreation { index_uid, .. }
| TaskContent::IndexUpdate { index_uid, .. } => Some(index_uid.as_str()),
TaskContent::Dump { .. } => None,
}
}
}
#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq)]
#[cfg_attr(test, derive(proptest_derive::Arbitrary))]
pub enum DocumentDeletion {
Clear,
Ids(Vec<String>),
}
#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
#[cfg_attr(test, derive(proptest_derive::Arbitrary))]
#[allow(clippy::large_enum_variant)]
pub enum TaskContent {
DocumentAddition {
index_uid: IndexUid,
#[cfg_attr(test, proptest(value = "Uuid::new_v4()"))]
content_uuid: Uuid,
#[cfg_attr(test, proptest(strategy = "test::index_document_method_strategy()"))]
merge_strategy: IndexDocumentsMethod,
primary_key: Option<String>,
documents_count: usize,
allow_index_creation: bool,
},
DocumentDeletion {
index_uid: IndexUid,
deletion: DocumentDeletion,
},
SettingsUpdate {
index_uid: IndexUid,
settings: Settings<Unchecked>,
/// Indicates whether the task was a deletion
is_deletion: bool,
allow_index_creation: bool,
},
IndexDeletion {
index_uid: IndexUid,
},
IndexCreation {
index_uid: IndexUid,
primary_key: Option<String>,
},
IndexUpdate {
index_uid: IndexUid,
primary_key: Option<String>,
},
Dump {
uid: String,
},
}
#[cfg(test)]
mod test {
use proptest::prelude::*;
use super::*;
pub(super) fn index_document_method_strategy() -> impl Strategy<Value = IndexDocumentsMethod> {
prop_oneof![
Just(IndexDocumentsMethod::ReplaceDocuments),
Just(IndexDocumentsMethod::UpdateDocuments),
]
}
pub(super) fn datetime_strategy() -> impl Strategy<Value = OffsetDateTime> {
Just(OffsetDateTime::now_utc())
}
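    // An illustrative sketch (not part of the original tests) of the helpers documented above:
    // a task with no terminal event is unfinished, only a `DocumentAddition` exposes a content
    // uuid, and a dump task has no index uid.
    #[test]
    fn task_helpers_sketch() {
        let dump_task = Task {
            id: 0,
            content: TaskContent::Dump {
                uid: "dump-1".to_string(),
            },
            events: vec![TaskEvent::Created(OffsetDateTime::now_utc())],
        };
        assert!(!dump_task.is_finished());
        assert_eq!(dump_task.get_content_uuid(), None);
        assert_eq!(dump_task.index_uid(), None);
        let finished_task = Task {
            events: vec![TaskEvent::succeeded(TaskResult::Other)],
            ..dump_task
        };
        assert!(finished_task.is_finished());
    }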
}

View File

@@ -1,377 +0,0 @@
#[allow(clippy::upper_case_acronyms)]
type BEU32 = milli::heed::zerocopy::U32<milli::heed::byteorder::BE>;
const INDEX_UIDS_TASK_IDS: &str = "index-uids-task-ids";
const TASKS: &str = "tasks";
use std::collections::HashSet;
use std::ops::Bound::{Excluded, Unbounded};
use std::result::Result as StdResult;
use std::sync::Arc;
use milli::heed::types::{OwnedType, SerdeJson, Str};
use milli::heed::{Database, Env, RoTxn, RwTxn};
use milli::heed_codec::RoaringBitmapCodec;
use roaring::RoaringBitmap;
use crate::tasks::task::{Task, TaskId};
use super::super::Result;
use super::TaskFilter;
pub struct Store {
env: Arc<Env>,
/// Maps an index uid to the set of tasks ids associated to it.
index_uid_task_ids: Database<Str, RoaringBitmapCodec>,
tasks: Database<OwnedType<BEU32>, SerdeJson<Task>>,
}
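// Prepare the heed environment for closing once the last `Store` referencing it is dropped.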
impl Drop for Store {
fn drop(&mut self) {
if Arc::strong_count(&self.env) == 1 {
self.env.as_ref().clone().prepare_for_closing();
}
}
}
impl Store {
/// Create a new store from the specified `Path`.
    /// Be really cautious when calling this function: the returned `Store` may
    /// be in an invalid state, with dangling processing tasks.
    /// You should patch all unfinished tasks and put them back in your pending
    /// queue with the `reset_and_return_unfinished_update` method.
pub fn new(env: Arc<milli::heed::Env>) -> Result<Self> {
let index_uid_task_ids = env.create_database(Some(INDEX_UIDS_TASK_IDS))?;
let tasks = env.create_database(Some(TASKS))?;
Ok(Self {
env,
index_uid_task_ids,
tasks,
})
}
pub fn wtxn(&self) -> Result<RwTxn> {
Ok(self.env.write_txn()?)
}
pub fn rtxn(&self) -> Result<RoTxn> {
Ok(self.env.read_txn()?)
}
/// Returns the id for the next task.
///
/// The required `mut txn` acts as a reservation system. It guarantees that as long as you commit
    /// the task to the store in the same transaction, no one else will have this task id.
pub fn next_task_id(&self, txn: &mut RwTxn) -> Result<TaskId> {
let id = self
.tasks
.lazily_decode_data()
.last(txn)?
.map(|(id, _)| id.get() + 1)
.unwrap_or(0);
Ok(id)
}
pub fn put(&self, txn: &mut RwTxn, task: &Task) -> Result<()> {
self.tasks.put(txn, &BEU32::new(task.id), task)?;
// only add the task to the indexes index if it has an index_uid
if let Some(index_uid) = task.index_uid() {
let mut tasks_set = self
.index_uid_task_ids
.get(txn, index_uid)?
.unwrap_or_default();
tasks_set.insert(task.id);
self.index_uid_task_ids.put(txn, index_uid, &tasks_set)?;
}
Ok(())
}
pub fn get(&self, txn: &RoTxn, id: TaskId) -> Result<Option<Task>> {
let task = self.tasks.get(txn, &BEU32::new(id))?;
Ok(task)
}
/// Returns the unfinished tasks starting from the given taskId in ascending order.
pub fn fetch_unfinished_tasks(&self, txn: &RoTxn, from: Option<TaskId>) -> Result<Vec<Task>> {
        // We must NEVER re-enqueue an already processed task! Its content uuid would point to a nonexistent file.
//
// TODO(marin): This may create some latency when the first batch lazy loads the pending updates.
let from = from.unwrap_or_default();
let result: StdResult<Vec<_>, milli::heed::Error> = self
.tasks
.range(txn, &(BEU32::new(from)..))?
.map(|r| r.map(|(_, t)| t))
.filter(|result| result.as_ref().map_or(true, |t| !t.is_finished()))
.collect();
result.map_err(Into::into)
}
/// Returns all the tasks starting from the given taskId and going in descending order.
pub fn list_tasks(
&self,
txn: &RoTxn,
from: Option<TaskId>,
filter: Option<TaskFilter>,
limit: Option<usize>,
) -> Result<Vec<Task>> {
let from = match from {
Some(from) => from,
None => self.tasks.last(txn)?.map_or(0, |(id, _)| id.get()),
};
let filter_fn = |task: &Task| {
filter
.as_ref()
.and_then(|f| f.filter_fn.as_ref())
.map_or(true, |f| f(task))
};
let result: Result<Vec<_>> = match filter.as_ref().and_then(|f| f.filtered_indexes()) {
Some(indexes) => self
.compute_candidates(txn, indexes, from)?
.filter(|result| result.as_ref().map_or(true, filter_fn))
.take(limit.unwrap_or(usize::MAX))
.collect(),
None => self
.tasks
.rev_range(txn, &(..=BEU32::new(from)))?
.map(|r| r.map(|(_, t)| t).map_err(Into::into))
.filter(|result| result.as_ref().map_or(true, filter_fn))
.take(limit.unwrap_or(usize::MAX))
.collect(),
};
result.map_err(Into::into)
}
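    /// Returns, in descending id order, the tasks that belong to any of the given indexes and
    /// whose id is lower than or equal to `from`.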
fn compute_candidates<'a>(
&'a self,
txn: &'a RoTxn,
indexes: &HashSet<String>,
from: TaskId,
) -> Result<impl Iterator<Item = Result<Task>> + 'a> {
let mut candidates = RoaringBitmap::new();
for index_uid in indexes {
if let Some(tasks_set) = self.index_uid_task_ids.get(txn, index_uid)? {
candidates |= tasks_set;
}
}
candidates.remove_range((Excluded(from), Unbounded));
let iter = candidates
.into_iter()
.rev()
.filter_map(|id| self.get(txn, id).transpose());
Ok(iter)
}
}
#[cfg(test)]
pub mod test {
use itertools::Itertools;
use meilisearch_types::index_uid::IndexUid;
use milli::heed::EnvOpenOptions;
use nelson::Mocker;
use tempfile::TempDir;
use crate::tasks::task::TaskContent;
use super::*;
/// TODO: use this mock to test the task store properly.
#[allow(dead_code)]
pub enum MockStore {
Real(Store),
Fake(Mocker),
}
pub struct TmpEnv(TempDir, Arc<milli::heed::Env>);
impl TmpEnv {
pub fn env(&self) -> Arc<milli::heed::Env> {
self.1.clone()
}
}
pub fn tmp_env() -> TmpEnv {
let tmp = tempfile::tempdir().unwrap();
let mut options = EnvOpenOptions::new();
options.map_size(4096 * 100000);
options.max_dbs(1000);
let env = Arc::new(options.open(tmp.path()).unwrap());
TmpEnv(tmp, env)
}
impl MockStore {
pub fn new(env: Arc<milli::heed::Env>) -> Result<Self> {
Ok(Self::Real(Store::new(env)?))
}
pub fn wtxn(&self) -> Result<RwTxn> {
match self {
MockStore::Real(index) => index.wtxn(),
MockStore::Fake(_) => todo!(),
}
}
pub fn rtxn(&self) -> Result<RoTxn> {
match self {
MockStore::Real(index) => index.rtxn(),
MockStore::Fake(_) => todo!(),
}
}
pub fn next_task_id(&self, txn: &mut RwTxn) -> Result<TaskId> {
match self {
MockStore::Real(index) => index.next_task_id(txn),
MockStore::Fake(_) => todo!(),
}
}
pub fn put(&self, txn: &mut RwTxn, task: &Task) -> Result<()> {
match self {
MockStore::Real(index) => index.put(txn, task),
MockStore::Fake(_) => todo!(),
}
}
pub fn get(&self, txn: &RoTxn, id: TaskId) -> Result<Option<Task>> {
match self {
MockStore::Real(index) => index.get(txn, id),
MockStore::Fake(_) => todo!(),
}
}
pub fn fetch_unfinished_tasks(
&self,
txn: &RoTxn,
from: Option<TaskId>,
) -> Result<Vec<Task>> {
match self {
MockStore::Real(index) => index.fetch_unfinished_tasks(txn, from),
MockStore::Fake(_) => todo!(),
}
}
pub fn list_tasks(
&self,
txn: &RoTxn,
from: Option<TaskId>,
filter: Option<TaskFilter>,
limit: Option<usize>,
) -> Result<Vec<Task>> {
match self {
MockStore::Real(index) => index.list_tasks(txn, from, filter, limit),
MockStore::Fake(_) => todo!(),
}
}
}
#[test]
fn test_ordered_filtered_updates() {
let tmp = tmp_env();
let store = Store::new(tmp.env()).unwrap();
let tasks = (0..100)
.map(|_| Task {
id: rand::random(),
content: TaskContent::IndexDeletion {
index_uid: IndexUid::new_unchecked("test"),
},
events: vec![],
})
.collect::<Vec<_>>();
let mut txn = store.env.write_txn().unwrap();
tasks
.iter()
.try_for_each(|t| store.put(&mut txn, t))
.unwrap();
let mut filter = TaskFilter::default();
filter.filter_index("test".into());
let tasks = store.list_tasks(&txn, None, Some(filter), None).unwrap();
assert!(tasks
.iter()
.map(|t| t.id)
.tuple_windows()
.all(|(a, b)| a > b));
}
#[test]
fn test_filter_same_index_prefix() {
let tmp = tmp_env();
let store = Store::new(tmp.env()).unwrap();
let task_1 = Task {
id: 1,
content: TaskContent::IndexDeletion {
index_uid: IndexUid::new_unchecked("test"),
},
events: vec![],
};
let task_2 = Task {
id: 0,
content: TaskContent::IndexDeletion {
index_uid: IndexUid::new_unchecked("test1"),
},
events: vec![],
};
let mut txn = store.wtxn().unwrap();
store.put(&mut txn, &task_1).unwrap();
store.put(&mut txn, &task_2).unwrap();
let mut filter = TaskFilter::default();
filter.filter_index("test".into());
let tasks = store.list_tasks(&txn, None, Some(filter), None).unwrap();
txn.abort().unwrap();
assert_eq!(tasks.len(), 1);
assert_eq!(tasks.first().as_ref().unwrap().index_uid().unwrap(), "test");
// same thing but invert the ids
let task_1 = Task {
id: 0,
content: TaskContent::IndexDeletion {
index_uid: IndexUid::new_unchecked("test"),
},
events: vec![],
};
let task_2 = Task {
id: 1,
content: TaskContent::IndexDeletion {
index_uid: IndexUid::new_unchecked("test1"),
},
events: vec![],
};
let mut txn = store.wtxn().unwrap();
store.put(&mut txn, &task_1).unwrap();
store.put(&mut txn, &task_2).unwrap();
let mut filter = TaskFilter::default();
filter.filter_index("test".into());
let tasks = store.list_tasks(&txn, None, Some(filter), None).unwrap();
assert_eq!(tasks.len(), 1);
assert_eq!(tasks.first().as_ref().unwrap().index_uid().unwrap(), "test");
}
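    // A hedged sketch (not part of the original tests) of two behaviours documented above:
    // `next_task_id` reserves an id for as long as the transaction is open, and
    // `fetch_unfinished_tasks` never returns a task whose last event is terminal.
    #[test]
    fn test_task_id_reservation_and_unfinished_fetch() {
        use crate::tasks::task::{TaskEvent, TaskResult};
        let tmp = tmp_env();
        let store = Store::new(tmp.env()).unwrap();
        let mut txn = store.wtxn().unwrap();
        assert_eq!(store.next_task_id(&mut txn).unwrap(), 0);
        // A finished task: its last event is `Succeeded`.
        let finished = Task {
            id: 0,
            content: TaskContent::IndexDeletion {
                index_uid: IndexUid::new_unchecked("test"),
            },
            events: vec![TaskEvent::succeeded(TaskResult::Other)],
        };
        store.put(&mut txn, &finished).unwrap();
        // The reservation has already moved forward within the same, uncommitted, transaction.
        assert_eq!(store.next_task_id(&mut txn).unwrap(), 1);
        // An unfinished task: no terminal event yet.
        let pending = Task {
            id: 1,
            content: TaskContent::IndexDeletion {
                index_uid: IndexUid::new_unchecked("test"),
            },
            events: vec![],
        };
        store.put(&mut txn, &pending).unwrap();
        let unfinished = store.fetch_unfinished_tasks(&txn, None).unwrap();
        assert_eq!(unfinished.len(), 1);
        assert_eq!(unfinished[0].id, 1);
        txn.commit().unwrap();
    }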
}

View File

@@ -1,93 +0,0 @@
use std::sync::Arc;
use time::OffsetDateTime;
use tokio::sync::{watch, RwLock};
use super::batch::Batch;
use super::error::Result;
use super::{BatchHandler, Scheduler};
use crate::tasks::task::TaskEvent;
/// The update loop sequentially performs batches of updates by asking the scheduler for a batch
/// and handing it to the first `BatchHandler` that accepts it.
pub struct UpdateLoop {
scheduler: Arc<RwLock<Scheduler>>,
performers: Vec<Arc<dyn BatchHandler + Send + Sync + 'static>>,
notifier: Option<watch::Receiver<()>>,
}
impl UpdateLoop {
pub fn new(
scheduler: Arc<RwLock<Scheduler>>,
performers: Vec<Arc<dyn BatchHandler + Send + Sync + 'static>>,
notifier: watch::Receiver<()>,
) -> Self {
Self {
scheduler,
performers,
notifier: Some(notifier),
}
}
pub async fn run(mut self) {
let mut notifier = self.notifier.take().unwrap();
loop {
if notifier.changed().await.is_err() {
break;
}
if let Err(e) = self.process_next_batch().await {
log::error!("an error occurred while processing an update batch: {}", e);
}
}
}
async fn process_next_batch(&self) -> Result<()> {
let mut batch = { self.scheduler.write().await.prepare().await? };
let performer = self
.performers
.iter()
.find(|p| p.accept(&batch))
.expect("No performer found for batch")
.clone();
batch
.content
.push_event(TaskEvent::Processing(OffsetDateTime::now_utc()));
batch.content = {
self.scheduler
.read()
.await
.update_tasks(batch.content)
.await?
};
let batch = performer.process_batch(batch).await;
self.handle_batch_result(batch, performer).await?;
Ok(())
}
/// Handles the result from a processed batch.
///
/// When a task is processed, the result of the process is pushed to its event list. The
    /// `handle_batch_result` makes sure that the new state is saved to the store.
/// The tasks are then removed from the processing queue.
async fn handle_batch_result(
&self,
mut batch: Batch,
performer: Arc<dyn BatchHandler + Sync + Send + 'static>,
) -> Result<()> {
let mut scheduler = self.scheduler.write().await;
let content = scheduler.update_tasks(batch.content).await?;
scheduler.finish();
drop(scheduler);
batch.content = content;
performer.finish(&batch).await;
Ok(())
}
}
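#[cfg(test)]
mod test {
    use std::sync::Arc;
    use crate::tasks::batch::Batch;
    use crate::tasks::{BatchHandler, EmptyBatchHandler, SnapshotHandler};
    // A hedged sketch (not part of the original file) of the performer selection done in
    // `process_next_batch`: the first registered `BatchHandler` whose `accept` returns true
    // receives the batch.
    #[test]
    fn first_accepting_performer_is_selected() {
        let performers = vec![
            Arc::new(SnapshotHandler) as Arc<dyn BatchHandler>,
            Arc::new(EmptyBatchHandler),
        ];
        let batch = Batch::empty();
        let selected = performers.iter().position(|p| p.accept(&batch));
        // The snapshot handler rejects an empty batch, so the empty handler is picked.
        assert_eq!(selected, Some(1));
    }
}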

View File

@@ -1,258 +0,0 @@
use std::fs::{create_dir_all, File};
use std::io::{self, BufReader, BufWriter, Write};
use std::ops::{Deref, DerefMut};
use std::path::{Path, PathBuf};
use milli::documents::DocumentsBatchReader;
use serde_json::Map;
use tempfile::{NamedTempFile, PersistError};
use uuid::Uuid;
#[cfg(not(test))]
pub use store::UpdateFileStore;
#[cfg(test)]
pub use test::MockUpdateFileStore as UpdateFileStore;
const UPDATE_FILES_PATH: &str = "updates/updates_files";
use crate::document_formats::read_ndjson;
pub struct UpdateFile {
path: PathBuf,
file: NamedTempFile,
}
#[derive(Debug, thiserror::Error)]
#[error("Error while persisting update to disk: {0}")]
pub struct UpdateFileStoreError(Box<dyn std::error::Error + Sync + Send + 'static>);
pub type Result<T> = std::result::Result<T, UpdateFileStoreError>;
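// Generates `From` impls that box each of the listed error types into an `UpdateFileStoreError`.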
macro_rules! into_update_store_error {
($($other:path),*) => {
$(
impl From<$other> for UpdateFileStoreError {
fn from(other: $other) -> Self {
Self(Box::new(other))
}
}
)*
};
}
into_update_store_error!(
PersistError,
io::Error,
serde_json::Error,
milli::documents::Error,
milli::documents::DocumentsBatchCursorError
);
impl UpdateFile {
pub fn persist(self) -> Result<()> {
self.file.persist(&self.path)?;
Ok(())
}
}
impl Deref for UpdateFile {
type Target = NamedTempFile;
fn deref(&self) -> &Self::Target {
&self.file
}
}
impl DerefMut for UpdateFile {
fn deref_mut(&mut self) -> &mut Self::Target {
&mut self.file
}
}
mod store {
use super::*;
#[derive(Clone, Debug)]
pub struct UpdateFileStore {
path: PathBuf,
}
impl UpdateFileStore {
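        /// Imports the update files contained in the dump at `src` into the update file store
        /// located under `dst`.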
pub fn load_dump(src: impl AsRef<Path>, dst: impl AsRef<Path>) -> anyhow::Result<()> {
let src_update_files_path = src.as_ref().join(UPDATE_FILES_PATH);
let dst_update_files_path = dst.as_ref().join(UPDATE_FILES_PATH);
// No update files to load
if !src_update_files_path.exists() {
return Ok(());
}
create_dir_all(&dst_update_files_path)?;
let entries = std::fs::read_dir(src_update_files_path)?;
for entry in entries {
let entry = entry?;
let update_file = BufReader::new(File::open(entry.path())?);
let file_uuid = entry.file_name();
let file_uuid = file_uuid
.to_str()
.ok_or_else(|| anyhow::anyhow!("invalid update file name"))?;
let dst_path = dst_update_files_path.join(file_uuid);
let dst_file = BufWriter::new(File::create(dst_path)?);
read_ndjson(update_file, dst_file)?;
}
Ok(())
}
pub fn new(path: impl AsRef<Path>) -> Result<Self> {
let path = path.as_ref().join(UPDATE_FILES_PATH);
std::fs::create_dir_all(&path)?;
Ok(Self { path })
}
/// Creates a new temporary update file.
/// A call to `persist` is needed to persist the file in the database.
pub fn new_update(&self) -> Result<(Uuid, UpdateFile)> {
let file = NamedTempFile::new_in(&self.path)?;
let uuid = Uuid::new_v4();
let path = self.path.join(uuid.to_string());
let update_file = UpdateFile { file, path };
Ok((uuid, update_file))
}
/// Returns the file corresponding to the requested uuid.
pub fn get_update(&self, uuid: Uuid) -> Result<File> {
let path = self.path.join(uuid.to_string());
let file = File::open(path)?;
Ok(file)
}
/// Copies the content of the update file pointed to by `uuid` to the `dst` directory.
pub fn snapshot(&self, uuid: Uuid, dst: impl AsRef<Path>) -> Result<()> {
let src = self.path.join(uuid.to_string());
let mut dst = dst.as_ref().join(UPDATE_FILES_PATH);
std::fs::create_dir_all(&dst)?;
dst.push(uuid.to_string());
std::fs::copy(src, dst)?;
Ok(())
}
        /// Performs a dump of the given update file uuid into the provided dump path,
        /// converting the stored documents batch back into newline-delimited JSON.
pub fn dump(&self, uuid: Uuid, dump_path: impl AsRef<Path>) -> Result<()> {
let uuid_string = uuid.to_string();
let update_file_path = self.path.join(&uuid_string);
let mut dst = dump_path.as_ref().join(UPDATE_FILES_PATH);
std::fs::create_dir_all(&dst)?;
dst.push(&uuid_string);
let update_file = File::open(update_file_path)?;
let mut dst_file = NamedTempFile::new_in(&dump_path)?;
let (mut document_cursor, index) =
DocumentsBatchReader::from_reader(update_file)?.into_cursor_and_fields_index();
let mut document_buffer = Map::new();
// TODO: we need to find a way to do this more efficiently. (create a custom serializer
// for jsonl for example...)
while let Some(document) = document_cursor.next_document()? {
for (field_id, content) in document.iter() {
if let Some(field_name) = index.name(field_id) {
let content = serde_json::from_slice(content)?;
document_buffer.insert(field_name.to_string(), content);
}
}
serde_json::to_writer(&mut dst_file, &document_buffer)?;
dst_file.write_all(b"\n")?;
document_buffer.clear();
}
dst_file.persist(dst)?;
Ok(())
}
pub fn get_size(&self, uuid: Uuid) -> Result<u64> {
Ok(self.get_update(uuid)?.metadata()?.len())
}
pub async fn delete(&self, uuid: Uuid) -> Result<()> {
let path = self.path.join(uuid.to_string());
tokio::fs::remove_file(path).await?;
Ok(())
}
}
}
#[cfg(test)]
mod test {
use std::sync::Arc;
use nelson::Mocker;
use super::*;
#[derive(Clone)]
pub enum MockUpdateFileStore {
Real(store::UpdateFileStore),
Mock(Arc<Mocker>),
}
impl MockUpdateFileStore {
pub fn mock(mocker: Mocker) -> Self {
Self::Mock(Arc::new(mocker))
}
pub fn load_dump(src: impl AsRef<Path>, dst: impl AsRef<Path>) -> anyhow::Result<()> {
store::UpdateFileStore::load_dump(src, dst)
}
pub fn new(path: impl AsRef<Path>) -> Result<Self> {
store::UpdateFileStore::new(path).map(Self::Real)
}
pub fn new_update(&self) -> Result<(Uuid, UpdateFile)> {
match self {
MockUpdateFileStore::Real(s) => s.new_update(),
MockUpdateFileStore::Mock(_) => todo!(),
}
}
pub fn get_update(&self, uuid: Uuid) -> Result<File> {
match self {
MockUpdateFileStore::Real(s) => s.get_update(uuid),
MockUpdateFileStore::Mock(_) => todo!(),
}
}
pub fn snapshot(&self, uuid: Uuid, dst: impl AsRef<Path>) -> Result<()> {
match self {
MockUpdateFileStore::Real(s) => s.snapshot(uuid, dst),
MockUpdateFileStore::Mock(_) => todo!(),
}
}
pub fn dump(&self, uuid: Uuid, dump_path: impl AsRef<Path>) -> Result<()> {
match self {
MockUpdateFileStore::Real(s) => s.dump(uuid, dump_path),
MockUpdateFileStore::Mock(_) => todo!(),
}
}
pub fn get_size(&self, uuid: Uuid) -> Result<u64> {
match self {
MockUpdateFileStore::Real(s) => s.get_size(uuid),
MockUpdateFileStore::Mock(_) => todo!(),
}
}
pub async fn delete(&self, uuid: Uuid) -> Result<()> {
match self {
MockUpdateFileStore::Real(s) => s.delete(uuid).await,
MockUpdateFileStore::Mock(mocker) => unsafe { mocker.get("delete").call(uuid) },
}
}
}
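    // A hedged usage sketch (not part of the original tests) for the real store: create it in
    // a temporary directory, write an update file, persist it, then read it back by uuid.
    #[test]
    fn new_update_persist_and_read_back_sketch() {
        use std::io::Write;
        let dir = tempfile::tempdir().unwrap();
        let store = store::UpdateFileStore::new(dir.path()).unwrap();
        let (uuid, mut update_file) = store.new_update().unwrap();
        update_file.write_all(b"{\"id\": 1}\n").unwrap();
        // `persist` moves the temporary file to its final, uuid-named location.
        update_file.persist().unwrap();
        let persisted = store.get_update(uuid).unwrap();
        assert_eq!(store.get_size(uuid).unwrap(), persisted.metadata().unwrap().len());
        assert!(store.get_size(uuid).unwrap() > 0);
    }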
}