Merge branch 'release-v0.30.0' into stable

Merge #3053
3053: Upgrade alpine 3.14 to 3.16 r=Kerollmops a=curquiza Otherwise CI is failing https://github.com/meilisearch/meilisearch/actions/runs/3470576605/jobs/5799173168 Co-authored-by: curquiza <clementine@meilisearch.com>
2025-07-22 14:21:03 +00:00 · 2022-11-28 13:56:35 +01:00 · 2022-11-15 13:56:45 +00:00 · 2022-11-15 14:54:18 +01:00 · 2022-11-15 11:08:47 +00:00 · 2022-11-15 12:07:00 +01:00
3 changed files with 1095 additions and 0 deletions
--- a/meilisearch-lib/Cargo.toml
+++ b/meilisearch-lib/Cargo.toml
@ -0,0 +1,66 @@
+[package]
+name = "meilisearch-lib"
+version = "0.29.2"
+edition = "2021"
+
+[dependencies]
+actix-web = { version = "4.0.1", default-features = false }
+anyhow = { version = "1.0.62", features = ["backtrace"] }
+async-stream = "0.3.3"
+async-trait = "0.1.52"
+atomic_refcell = "0.1.8"
+byte-unit = { version = "4.0.14", default-features = false, features = ["std", "serde"] }
+bytes = "1.1.0"
+clap = { version = "3.1.6", features = ["derive", "env"] }
+crossbeam-channel = "0.5.2"
+csv = "1.1.6"
+derivative = "2.2.0"
+either = { version = "1.6.1", features = ["serde"] }
+flate2 = "1.0.22"
+fs_extra = "1.2.0"
+fst = "0.4.7"
+futures = "0.3.21"
+futures-util = "0.3.21"
+http = "0.2.6"
+indexmap = { version = "1.8.0", features = ["serde-1"] }
+itertools = "0.10.3"
+lazy_static = "1.4.0"
+log = "0.4.14"
+meilisearch-auth = { path = "../meilisearch-auth" }
+meilisearch-types = { path = "../meilisearch-types" }
+milli = { git = "https://github.com/meilisearch/milli.git", tag = "v0.33.5" }
+mime = "0.3.16"
+num_cpus = "1.13.1"
+obkv = "0.2.0"
+once_cell = "1.10.0"
+page_size = "0.4.2"
+parking_lot = "0.12.0"
+permissive-json-pointer = { path = "../permissive-json-pointer" }
+rand = "0.8.5"
+rayon = "1.5.1"
+regex = "1.5.5"
+reqwest = { version = "0.11.9", features = ["json", "rustls-tls"], default-features = false, optional = true }
+roaring = "0.9.0"
+rustls = "0.20.4"
+serde = { version = "1.0.136", features = ["derive"] }
+serde_json = { version = "1.0.85", features = ["preserve_order"] }
+siphasher = "0.3.10"
+slice-group-by = "0.3.0"
+sysinfo = "0.23.5"
+tar = "0.4.38"
+tempfile = "3.3.0"
+thiserror = "1.0.30"
+time = { version = "0.3.7", features = ["serde-well-known", "formatting", "parsing", "macros"] }
+tokio = { version = "1.17.0", features = ["full"] }
+uuid = { version = "1.1.2", features = ["serde", "v4"] }
+walkdir = "2.3.2"
+whoami = { version = "1.2.1", optional = true }
+
+[dev-dependencies]
+actix-rt = "2.7.0"
+meilisearch-types = { path = "../meilisearch-types", features = ["test-traits"] }
+mockall = "0.11.0"
+nelson = { git = "https://github.com/meilisearch/nelson.git", rev = "675f13885548fb415ead8fbb447e9e6d9314000a"}
+paste = "1.0.6"
+proptest = "1.0.0"
+proptest-derive = "0.3.0"
--- a/meilisearch-lib/src/tasks/scheduler.rs
+++ b/meilisearch-lib/src/tasks/scheduler.rs
@ -0,0 +1,609 @@
+use std::cmp::Ordering;
+use std::collections::{hash_map::Entry, BinaryHeap, HashMap, VecDeque};
+use std::ops::{Deref, DerefMut};
+use std::slice;
+use std::sync::Arc;
+
+use atomic_refcell::AtomicRefCell;
+use milli::update::IndexDocumentsMethod;
+use time::OffsetDateTime;
+use tokio::sync::{watch, RwLock};
+
+use crate::options::SchedulerConfig;
+use crate::snapshot::SnapshotJob;
+
+use super::batch::{Batch, BatchContent};
+use super::error::Result;
+use super::task::{Task, TaskContent, TaskEvent, TaskId};
+use super::update_loop::UpdateLoop;
+use super::{BatchHandler, TaskFilter, TaskStore};
+
+#[derive(Eq, Debug, Clone, Copy)]
+enum TaskType {
+    DocumentAddition { number: usize },
+    DocumentUpdate { number: usize },
+    IndexUpdate,
+    Dump,
+}
+
+/// Two tasks are equal if they have the same type.
+impl PartialEq for TaskType {
+    fn eq(&self, other: &Self) -> bool {
+        matches!(
+            (self, other),
+            (Self::DocumentAddition { .. }, Self::DocumentAddition { .. })
+                | (Self::DocumentUpdate { .. }, Self::DocumentUpdate { .. })
+        )
+    }
+}
+
+#[derive(Eq, Debug, Clone, Copy)]
+struct PendingTask {
+    kind: TaskType,
+    id: TaskId,
+}
+
+impl PartialEq for PendingTask {
+    fn eq(&self, other: &Self) -> bool {
+        self.id.eq(&other.id)
+    }
+}
+
+impl PartialOrd for PendingTask {
+    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
+        Some(self.cmp(other))
+    }
+}
+
+impl Ord for PendingTask {
+    fn cmp(&self, other: &Self) -> Ordering {
+        self.id.cmp(&other.id).reverse()
+    }
+}
+
+#[derive(Debug)]
+struct TaskList {
+    id: TaskListIdentifier,
+    tasks: BinaryHeap<PendingTask>,
+}
+
+impl Deref for TaskList {
+    type Target = BinaryHeap<PendingTask>;
+
+    fn deref(&self) -> &Self::Target {
+        &self.tasks
+    }
+}
+
+impl DerefMut for TaskList {
+    fn deref_mut(&mut self) -> &mut Self::Target {
+        &mut self.tasks
+    }
+}
+
+impl TaskList {
+    fn new(id: TaskListIdentifier) -> Self {
+        Self {
+            id,
+            tasks: Default::default(),
+        }
+    }
+}
+
+impl PartialEq for TaskList {
+    fn eq(&self, other: &Self) -> bool {
+        self.id == other.id
+    }
+}
+
+impl Eq for TaskList {}
+
+impl Ord for TaskList {
+    fn cmp(&self, other: &Self) -> Ordering {
+        match (&self.id, &other.id) {
+            (TaskListIdentifier::Index(_), TaskListIdentifier::Index(_)) => {
+                match (self.peek(), other.peek()) {
+                    (None, None) => Ordering::Equal,
+                    (None, Some(_)) => Ordering::Less,
+                    (Some(_), None) => Ordering::Greater,
+                    (Some(lhs), Some(rhs)) => lhs.cmp(rhs),
+                }
+            }
+            (TaskListIdentifier::Index(_), TaskListIdentifier::Dump) => Ordering::Less,
+            (TaskListIdentifier::Dump, TaskListIdentifier::Index(_)) => Ordering::Greater,
+            (TaskListIdentifier::Dump, TaskListIdentifier::Dump) => {
+                unreachable!("There should be only one Dump task list")
+            }
+        }
+    }
+}
+
+impl PartialOrd for TaskList {
+    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
+        Some(self.cmp(other))
+    }
+}
+
+#[derive(PartialEq, Eq, Hash, Debug, Clone)]
+enum TaskListIdentifier {
+    Index(String),
+    Dump,
+}
+
+impl From<&Task> for TaskListIdentifier {
+    fn from(task: &Task) -> Self {
+        match &task.content {
+            TaskContent::DocumentAddition { index_uid, .. }
+            | TaskContent::DocumentDeletion { index_uid, .. }
+            | TaskContent::SettingsUpdate { index_uid, .. }
+            | TaskContent::IndexDeletion { index_uid }
+            | TaskContent::IndexCreation { index_uid, .. }
+            | TaskContent::IndexUpdate { index_uid, .. } => {
+                TaskListIdentifier::Index(index_uid.as_str().to_string())
+            }
+            TaskContent::Dump { .. } => TaskListIdentifier::Dump,
+        }
+    }
+}
+
+#[derive(Default)]
+struct TaskQueue {
+    /// Maps index uids to their TaskList, for quick access
+    index_tasks: HashMap<TaskListIdentifier, Arc<AtomicRefCell<TaskList>>>,
+    /// A queue that orders TaskList by the priority of their fist update
+    queue: BinaryHeap<Arc<AtomicRefCell<TaskList>>>,
+}
+
+impl TaskQueue {
+    fn insert(&mut self, task: Task) {
+        let id = task.id;
+        let uid = TaskListIdentifier::from(&task);
+
+        let kind = match task.content {
+            TaskContent::DocumentAddition {
+                documents_count,
+                merge_strategy: IndexDocumentsMethod::ReplaceDocuments,
+                ..
+            } => TaskType::DocumentAddition {
+                number: documents_count,
+            },
+            TaskContent::DocumentAddition {
+                documents_count,
+                merge_strategy: IndexDocumentsMethod::UpdateDocuments,
+                ..
+            } => TaskType::DocumentUpdate {
+                number: documents_count,
+            },
+            TaskContent::Dump { .. } => TaskType::Dump,
+            TaskContent::DocumentDeletion { .. }
+            | TaskContent::SettingsUpdate { .. }
+            | TaskContent::IndexDeletion { .. }
+            | TaskContent::IndexCreation { .. }
+            | TaskContent::IndexUpdate { .. } => TaskType::IndexUpdate,
+            _ => unreachable!("unhandled task type"),
+        };
+        let task = PendingTask { kind, id };
+
+        match self.index_tasks.entry(uid) {
+            Entry::Occupied(entry) => {
+                // A task list already exists for this index, all we have to to is to push the new
+                // update to the end of the list. This won't change the order since ids are
+                // monotonically increasing.
+                let mut list = entry.get().borrow_mut();
+
+                // We only need the first element to be lower than the one we want to
+                // insert to preserve the order in the queue.
+                assert!(list.peek().map(|old_id| id >= old_id.id).unwrap_or(true));
+
+                list.push(task);
+            }
+            Entry::Vacant(entry) => {
+                let mut task_list = TaskList::new(entry.key().clone());
+                task_list.push(task);
+                let task_list = Arc::new(AtomicRefCell::new(task_list));
+                entry.insert(task_list.clone());
+                self.queue.push(task_list);
+            }
+        }
+    }
+
+    /// Passes a context with a view to the task list of the next index to schedule. It is
+    /// guaranteed that the first id from task list will be the lowest pending task id.
+    fn head_mut<R>(&mut self, mut f: impl FnMut(&mut TaskList) -> R) -> Option<R> {
+        let head = self.queue.pop()?;
+        let result = {
+            let mut ref_head = head.borrow_mut();
+            f(&mut ref_head)
+        };
+        if !head.borrow().tasks.is_empty() {
+            // After being mutated, the head is reinserted to the correct position.
+            self.queue.push(head);
+        } else {
+            self.index_tasks.remove(&head.borrow().id);
+        }
+
+        Some(result)
+    }
+
+    pub fn is_empty(&self) -> bool {
+        self.queue.is_empty() && self.index_tasks.is_empty()
+    }
+}
+
+pub struct Scheduler {
+    // TODO: currently snapshots are non persistent tasks, and are treated differently.
+    snapshots: VecDeque<SnapshotJob>,
+    tasks: TaskQueue,
+
+    store: TaskStore,
+    processing: Processing,
+    next_fetched_task_id: TaskId,
+    config: SchedulerConfig,
+    /// Notifies the update loop that a new task was received
+    notifier: watch::Sender<()>,
+}
+
+impl Scheduler {
+    pub fn new(
+        store: TaskStore,
+        performers: Vec<Arc<dyn BatchHandler + Sync + Send + 'static>>,
+        config: SchedulerConfig,
+    ) -> Result<Arc<RwLock<Self>>> {
+        let (notifier, rcv) = watch::channel(());
+
+        let this = Self {
+            snapshots: VecDeque::new(),
+            tasks: TaskQueue::default(),
+
+            store,
+            processing: Processing::Nothing,
+            next_fetched_task_id: 0,
+            config,
+            notifier,
+        };
+
+        // Notify update loop to start processing pending updates immediately after startup.
+        this.notify();
+
+        let this = Arc::new(RwLock::new(this));
+
+        let update_loop = UpdateLoop::new(this.clone(), performers, rcv);
+
+        tokio::task::spawn_local(update_loop.run());
+
+        Ok(this)
+    }
+
+    fn register_task(&mut self, task: Task) {
+        assert!(!task.is_finished());
+        self.tasks.insert(task);
+    }
+
+    /// Clears the processing list, this method should be called when the processing of a batch is finished.
+    pub fn finish(&mut self) {
+        self.processing = Processing::Nothing;
+    }
+
+    pub fn notify(&self) {
+        let _ = self.notifier.send(());
+    }
+
+    fn notify_if_not_empty(&self) {
+        if !self.snapshots.is_empty() || !self.tasks.is_empty() {
+            self.notify();
+        }
+    }
+
+    pub async fn update_tasks(&self, content: BatchContent) -> Result<BatchContent> {
+        match content {
+            BatchContent::DocumentsAdditionBatch(tasks) => {
+                let tasks = self.store.update_tasks(tasks).await?;
+                Ok(BatchContent::DocumentsAdditionBatch(tasks))
+            }
+            BatchContent::IndexUpdate(t) => {
+                let mut tasks = self.store.update_tasks(vec![t]).await?;
+                Ok(BatchContent::IndexUpdate(tasks.remove(0)))
+            }
+            BatchContent::Dump(t) => {
+                let mut tasks = self.store.update_tasks(vec![t]).await?;
+                Ok(BatchContent::Dump(tasks.remove(0)))
+            }
+            other => Ok(other),
+        }
+    }
+
+    pub async fn get_task(&self, id: TaskId, filter: Option<TaskFilter>) -> Result<Task> {
+        self.store.get_task(id, filter).await
+    }
+
+    pub async fn list_tasks(
+        &self,
+        offset: Option<TaskId>,
+        filter: Option<TaskFilter>,
+        limit: Option<usize>,
+    ) -> Result<Vec<Task>> {
+        self.store.list_tasks(offset, filter, limit).await
+    }
+
+    pub async fn get_processing_tasks(&self) -> Result<Vec<Task>> {
+        let mut tasks = Vec::new();
+
+        for id in self.processing.ids() {
+            let task = self.store.get_task(id, None).await?;
+            tasks.push(task);
+        }
+
+        Ok(tasks)
+    }
+
+    pub fn schedule_snapshot(&mut self, job: SnapshotJob) {
+        self.snapshots.push_back(job);
+        self.notify();
+    }
+
+    async fn fetch_pending_tasks(&mut self) -> Result<()> {
+        self.store
+            .fetch_unfinished_tasks(Some(self.next_fetched_task_id))
+            .await?
+            .into_iter()
+            .for_each(|t| {
+                self.next_fetched_task_id = t.id + 1;
+                self.register_task(t);
+            });
+
+        Ok(())
+    }
+
+    /// Prepare the next batch, and set `processing` to the ids in that batch.
+    pub async fn prepare(&mut self) -> Result<Batch> {
+        // If there is a job to process, do it first.
+        if let Some(job) = self.snapshots.pop_front() {
+            // There is more work to do, notify the update loop
+            self.notify_if_not_empty();
+            let batch = Batch::new(None, BatchContent::Snapshot(job));
+            return Ok(batch);
+        }
+
+        // Try to fill the queue with pending tasks.
+        self.fetch_pending_tasks().await?;
+
+        self.processing = make_batch(&mut self.tasks, &self.config);
+
+        log::debug!("prepared batch with {} tasks", self.processing.len());
+
+        if !self.processing.is_nothing() {
+            let (processing, mut content) = self
+                .store
+                .get_processing_tasks(std::mem::take(&mut self.processing))
+                .await?;
+
+            // The batch id is the id of the first update it contains. At this point we must have a
+            // valid batch that contains at least 1 task.
+            let id = match content.first() {
+                Some(Task { id, .. }) => *id,
+                _ => panic!("invalid batch"),
+            };
+
+            content.push_event(TaskEvent::Batched {
+                batch_id: id,
+                timestamp: OffsetDateTime::now_utc(),
+            });
+
+            self.processing = processing;
+
+            let batch = Batch::new(Some(id), content);
+
+            // There is more work to do, notify the update loop
+            self.notify_if_not_empty();
+
+            Ok(batch)
+        } else {
+            Ok(Batch::empty())
+        }
+    }
+}
+
+#[derive(Debug, PartialEq, Eq)]
+pub enum Processing {
+    DocumentAdditions(Vec<TaskId>),
+    IndexUpdate(TaskId),
+    Dump(TaskId),
+    /// Variant used when there is nothing to process.
+    Nothing,
+}
+
+impl Default for Processing {
+    fn default() -> Self {
+        Self::Nothing
+    }
+}
+
+enum ProcessingIter<'a> {
+    Many(slice::Iter<'a, TaskId>),
+    Single(Option<TaskId>),
+}
+
+impl<'a> Iterator for ProcessingIter<'a> {
+    type Item = TaskId;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        match self {
+            ProcessingIter::Many(iter) => iter.next().copied(),
+            ProcessingIter::Single(val) => val.take(),
+        }
+    }
+}
+
+impl Processing {
+    fn is_nothing(&self) -> bool {
+        matches!(self, Processing::Nothing)
+    }
+
+    pub fn ids(&self) -> impl Iterator<Item = TaskId> + '_ {
+        match self {
+            Processing::DocumentAdditions(v) => ProcessingIter::Many(v.iter()),
+            Processing::IndexUpdate(id) | Processing::Dump(id) => ProcessingIter::Single(Some(*id)),
+            Processing::Nothing => ProcessingIter::Single(None),
+        }
+    }
+
+    pub fn len(&self) -> usize {
+        match self {
+            Processing::DocumentAdditions(v) => v.len(),
+            Processing::IndexUpdate(_) | Processing::Dump(_) => 1,
+            Processing::Nothing => 0,
+        }
+    }
+
+    pub fn is_empty(&self) -> bool {
+        self.len() == 0
+    }
+}
+
+fn make_batch(tasks: &mut TaskQueue, config: &SchedulerConfig) -> Processing {
+    let mut doc_count = 0;
+    tasks
+        .head_mut(|list| match list.peek().copied() {
+            Some(PendingTask {
+                kind: TaskType::IndexUpdate,
+                id,
+            }) => {
+                list.pop();
+                Processing::IndexUpdate(id)
+            }
+            Some(PendingTask {
+                kind: TaskType::Dump,
+                id,
+            }) => {
+                list.pop();
+                Processing::Dump(id)
+            }
+            Some(PendingTask { kind, .. }) => {
+                let mut task_list = Vec::new();
+                loop {
+                    match list.peek() {
+                        Some(pending) if pending.kind == kind => {
+                            // We always need to process at least one task for the scheduler to make progress.
+                            if config.disable_auto_batching && !task_list.is_empty() {
+                                break;
+                            }
+                            let pending = list.pop().unwrap();
+                            task_list.push(pending.id);
+
+                            // We add the number of documents to the count if we are scheduling document additions.
+                            match pending.kind {
+                                TaskType::DocumentUpdate { number }
+                                | TaskType::DocumentAddition { number } => {
+                                    doc_count += number;
+                                }
+                                _ => (),
+                            }
+                        }
+                        _ => break,
+                    }
+                }
+                Processing::DocumentAdditions(task_list)
+            }
+            None => Processing::Nothing,
+        })
+        .unwrap_or(Processing::Nothing)
+}
+
+#[cfg(test)]
+mod test {
+    use meilisearch_types::index_uid::IndexUid;
+    use milli::update::IndexDocumentsMethod;
+    use uuid::Uuid;
+
+    use crate::tasks::task::TaskContent;
+
+    use super::*;
+
+    fn gen_task(id: TaskId, content: TaskContent) -> Task {
+        Task {
+            id,
+            content,
+            events: vec![],
+        }
+    }
+
+    #[test]
+    #[rustfmt::skip]
+    fn register_updates_multiples_indexes() {
+        let mut queue = TaskQueue::default();
+        queue.insert(gen_task(0, TaskContent::IndexDeletion { index_uid: IndexUid::new_unchecked("test1") }));
+        queue.insert(gen_task(1, TaskContent::IndexDeletion { index_uid: IndexUid::new_unchecked("test2") }));
+        queue.insert(gen_task(2, TaskContent::IndexDeletion { index_uid: IndexUid::new_unchecked("test2") }));
+        queue.insert(gen_task(3, TaskContent::IndexDeletion { index_uid: IndexUid::new_unchecked("test2") }));
+        queue.insert(gen_task(4, TaskContent::IndexDeletion { index_uid: IndexUid::new_unchecked("test1") }));
+        queue.insert(gen_task(5, TaskContent::IndexDeletion { index_uid: IndexUid::new_unchecked("test1") }));
+        queue.insert(gen_task(6, TaskContent::IndexDeletion { index_uid: IndexUid::new_unchecked("test2") }));
+
+        let test1_tasks = queue
+            .head_mut(|tasks| tasks.drain().map(|t| t.id).collect::<Vec<_>>())
+            .unwrap();
+
+        assert_eq!(test1_tasks, &[0, 4, 5]);
+
+        let test2_tasks = queue
+            .head_mut(|tasks| tasks.drain().map(|t| t.id).collect::<Vec<_>>())
+            .unwrap();
+
+        assert_eq!(test2_tasks, &[1, 2, 3, 6]);
+
+        assert!(queue.index_tasks.is_empty());
+        assert!(queue.queue.is_empty());
+    }
+
+    fn gen_doc_addition_task_content(index_uid: &str) -> TaskContent {
+        TaskContent::DocumentAddition {
+            content_uuid: Uuid::new_v4(),
+            merge_strategy: IndexDocumentsMethod::ReplaceDocuments,
+            primary_key: Some("test".to_string()),
+            documents_count: 0,
+            allow_index_creation: true,
+            index_uid: IndexUid::new_unchecked(index_uid),
+        }
+    }
+
+    #[test]
+    #[rustfmt::skip]
+    fn test_make_batch() {
+        let mut queue = TaskQueue::default();
+        queue.insert(gen_task(0, gen_doc_addition_task_content("test1")));
+        queue.insert(gen_task(1, gen_doc_addition_task_content("test2")));
+        queue.insert(gen_task(2, TaskContent::IndexDeletion { index_uid: IndexUid::new_unchecked("test2")}));
+        queue.insert(gen_task(3, gen_doc_addition_task_content("test2")));
+        queue.insert(gen_task(4, gen_doc_addition_task_content("test1")));
+        queue.insert(gen_task(5, TaskContent::IndexDeletion { index_uid: IndexUid::new_unchecked("test1")}));
+        queue.insert(gen_task(6, gen_doc_addition_task_content("test2")));
+        queue.insert(gen_task(7, gen_doc_addition_task_content("test1")));
+        queue.insert(gen_task(8, TaskContent::Dump { uid: "adump".to_owned() }));
+
+        let config = SchedulerConfig::default();
+
+        // Make sure that the dump is processed before everybody else.
+        let batch = make_batch(&mut queue, &config);
+        assert_eq!(batch, Processing::Dump(8));
+
+        let batch = make_batch(&mut queue, &config);
+        assert_eq!(batch, Processing::DocumentAdditions(vec![0, 4]));
+
+        let batch = make_batch(&mut queue, &config);
+        assert_eq!(batch, Processing::DocumentAdditions(vec![1]));
+
+        let batch = make_batch(&mut queue, &config);
+        assert_eq!(batch, Processing::IndexUpdate(2));
+
+        let batch = make_batch(&mut queue, &config);
+        assert_eq!(batch, Processing::DocumentAdditions(vec![3, 6]));
+
+        let batch = make_batch(&mut queue, &config);
+        assert_eq!(batch, Processing::IndexUpdate(5));
+
+        let batch = make_batch(&mut queue, &config);
+        assert_eq!(batch, Processing::DocumentAdditions(vec![7]));
+
+        assert!(queue.is_empty());
+    }
+}
--- a/meilisearch-lib/src/tasks/task_store/mod.rs
+++ b/meilisearch-lib/src/tasks/task_store/mod.rs
@ -0,0 +1,420 @@
+mod store;
+
+use std::collections::HashSet;
+use std::io::{BufWriter, Write};
+use std::path::Path;
+use std::sync::Arc;
+
+use log::debug;
+use milli::heed::{Env, RwTxn};
+use time::OffsetDateTime;
+
+use super::batch::BatchContent;
+use super::error::TaskError;
+use super::scheduler::Processing;
+use super::task::{Task, TaskContent, TaskId};
+use super::Result;
+use crate::tasks::task::TaskEvent;
+use crate::update_file_store::UpdateFileStore;
+
+#[cfg(test)]
+pub use store::test::MockStore as Store;
+#[cfg(not(test))]
+pub use store::Store;
+
+type FilterFn = Box<dyn Fn(&Task) -> bool + Sync + Send + 'static>;
+
+/// Defines constraints to be applied when querying for Tasks from the store.
+#[derive(Default)]
+pub struct TaskFilter {
+    indexes: Option<HashSet<String>>,
+    filter_fn: Option<FilterFn>,
+}
+
+impl TaskFilter {
+    fn pass(&self, task: &Task) -> bool {
+        match task.index_uid() {
+            Some(index_uid) => self
+                .indexes
+                .as_ref()
+                .map_or(true, |indexes| indexes.contains(index_uid)),
+            None => false,
+        }
+    }
+
+    fn filtered_indexes(&self) -> Option<&HashSet<String>> {
+        self.indexes.as_ref()
+    }
+
+    /// Adds an index to the filter, so the filter must match this index.
+    pub fn filter_index(&mut self, index: String) {
+        self.indexes
+            .get_or_insert_with(Default::default)
+            .insert(index);
+    }
+
+    pub fn filter_fn(&mut self, f: FilterFn) {
+        self.filter_fn.replace(f);
+    }
+}
+
+pub struct TaskStore {
+    store: Arc<Store>,
+}
+
+impl Clone for TaskStore {
+    fn clone(&self) -> Self {
+        Self {
+            store: self.store.clone(),
+        }
+    }
+}
+
+impl TaskStore {
+    pub fn new(env: Arc<milli::heed::Env>) -> Result<Self> {
+        let store = Arc::new(Store::new(env)?);
+        Ok(Self { store })
+    }
+
+    pub async fn register(&self, content: TaskContent) -> Result<Task> {
+        debug!("registering update: {:?}", content);
+        let store = self.store.clone();
+        let task = tokio::task::spawn_blocking(move || -> Result<Task> {
+            let mut txn = store.wtxn()?;
+            let next_task_id = store.next_task_id(&mut txn)?;
+            let created_at = TaskEvent::Created(OffsetDateTime::now_utc());
+            let task = Task {
+                id: next_task_id,
+                content,
+                events: vec![created_at],
+            };
+
+            store.put(&mut txn, &task)?;
+            txn.commit()?;
+
+            Ok(task)
+        })
+        .await??;
+
+        Ok(task)
+    }
+
+    pub fn register_raw_update(&self, wtxn: &mut RwTxn, task: &Task) -> Result<()> {
+        self.store.put(wtxn, task)?;
+        Ok(())
+    }
+
+    pub async fn get_task(&self, id: TaskId, filter: Option<TaskFilter>) -> Result<Task> {
+        let store = self.store.clone();
+        let task = tokio::task::spawn_blocking(move || -> Result<_> {
+            let txn = store.rtxn()?;
+            let task = store.get(&txn, id)?;
+            Ok(task)
+        })
+        .await??
+        .ok_or(TaskError::UnexistingTask(id))?;
+
+        match filter {
+            Some(filter) => filter
+                .pass(&task)
+                .then_some(task)
+                .ok_or(TaskError::UnexistingTask(id)),
+            None => Ok(task),
+        }
+    }
+
+    /// This methods takes a `Processing` which contains the next task ids to process, and returns
+    /// the corresponding tasks along with the ownership to the passed processing.
+    ///
+    /// We need get_processing_tasks to take ownership over `Processing` because we need it to be
+    /// valid for 'static.
+    pub async fn get_processing_tasks(
+        &self,
+        processing: Processing,
+    ) -> Result<(Processing, BatchContent)> {
+        let store = self.store.clone();
+        let tasks = tokio::task::spawn_blocking(move || -> Result<_> {
+            let txn = store.rtxn()?;
+
+            let content = match processing {
+                Processing::DocumentAdditions(ref ids) => {
+                    let mut tasks = Vec::new();
+
+                    for id in ids.iter() {
+                        let task = store
+                            .get(&txn, *id)?
+                            .ok_or(TaskError::UnexistingTask(*id))?;
+                        tasks.push(task);
+                    }
+                    BatchContent::DocumentsAdditionBatch(tasks)
+                }
+                Processing::IndexUpdate(id) => {
+                    let task = store.get(&txn, id)?.ok_or(TaskError::UnexistingTask(id))?;
+                    BatchContent::IndexUpdate(task)
+                }
+                Processing::Dump(id) => {
+                    let task = store.get(&txn, id)?.ok_or(TaskError::UnexistingTask(id))?;
+                    debug_assert!(matches!(task.content, TaskContent::Dump { .. }));
+                    BatchContent::Dump(task)
+                }
+                Processing::Nothing => BatchContent::Empty,
+            };
+
+            Ok((processing, content))
+        })
+        .await??;
+
+        Ok(tasks)
+    }
+
+    pub async fn update_tasks(&self, tasks: Vec<Task>) -> Result<Vec<Task>> {
+        let store = self.store.clone();
+
+        let tasks = tokio::task::spawn_blocking(move || -> Result<_> {
+            let mut txn = store.wtxn()?;
+
+            for task in &tasks {
+                store.put(&mut txn, task)?;
+            }
+
+            txn.commit()?;
+
+            Ok(tasks)
+        })
+        .await??;
+
+        Ok(tasks)
+    }
+
+    pub async fn fetch_unfinished_tasks(&self, offset: Option<TaskId>) -> Result<Vec<Task>> {
+        let store = self.store.clone();
+
+        tokio::task::spawn_blocking(move || {
+            let txn = store.rtxn()?;
+            let tasks = store.fetch_unfinished_tasks(&txn, offset)?;
+            Ok(tasks)
+        })
+        .await?
+    }
+
+    pub async fn list_tasks(
+        &self,
+        offset: Option<TaskId>,
+        filter: Option<TaskFilter>,
+        limit: Option<usize>,
+    ) -> Result<Vec<Task>> {
+        let store = self.store.clone();
+
+        tokio::task::spawn_blocking(move || {
+            let txn = store.rtxn()?;
+            let tasks = store.list_tasks(&txn, offset, filter, limit)?;
+            Ok(tasks)
+        })
+        .await?
+    }
+
+    pub async fn dump(
+        env: Arc<Env>,
+        dir_path: impl AsRef<Path>,
+        update_file_store: UpdateFileStore,
+    ) -> Result<()> {
+        let store = Self::new(env)?;
+        let update_dir = dir_path.as_ref().join("updates");
+        let updates_file = update_dir.join("data.jsonl");
+        let tasks = store.list_tasks(None, None, None).await?;
+
+        let dir_path = dir_path.as_ref().to_path_buf();
+        tokio::task::spawn_blocking(move || -> Result<()> {
+            std::fs::create_dir(&update_dir)?;
+            let updates_file = std::fs::File::create(updates_file)?;
+            let mut updates_file = BufWriter::new(updates_file);
+
+            for task in tasks {
+                serde_json::to_writer(&mut updates_file, &task)?;
+                updates_file.write_all(b"\n")?;
+
+                if !task.is_finished() {
+                    if let Some(content_uuid) = task.get_content_uuid() {
+                        update_file_store.dump(content_uuid, &dir_path)?;
+                    }
+                }
+            }
+            updates_file.flush()?;
+            Ok(())
+        })
+        .await??;
+
+        Ok(())
+    }
+
+    pub fn load_dump(src: impl AsRef<Path>, env: Arc<Env>) -> anyhow::Result<()> {
+        // create a dummy update field store, since it is not needed right now.
+        let store = Self::new(env.clone())?;
+
+        let src_update_path = src.as_ref().join("updates");
+        let update_data = std::fs::File::open(&src_update_path.join("data.jsonl"))?;
+        let update_data = std::io::BufReader::new(update_data);
+
+        let stream = serde_json::Deserializer::from_reader(update_data).into_iter::<Task>();
+
+        let mut wtxn = env.write_txn()?;
+        for entry in stream {
+            store.register_raw_update(&mut wtxn, &entry?)?;
+        }
+        wtxn.commit()?;
+
+        Ok(())
+    }
+}
+
+#[cfg(test)]
+pub mod test {
+    use crate::tasks::{scheduler::Processing, task_store::store::test::tmp_env};
+
+    use super::*;
+
+    use meilisearch_types::index_uid::IndexUid;
+    use nelson::Mocker;
+    use proptest::{
+        strategy::Strategy,
+        test_runner::{Config, TestRunner},
+    };
+
+    pub enum MockTaskStore {
+        Real(TaskStore),
+        Mock(Arc<Mocker>),
+    }
+
+    impl Clone for MockTaskStore {
+        fn clone(&self) -> Self {
+            match self {
+                Self::Real(x) => Self::Real(x.clone()),
+                Self::Mock(x) => Self::Mock(x.clone()),
+            }
+        }
+    }
+
+    impl MockTaskStore {
+        pub fn new(env: Arc<milli::heed::Env>) -> Result<Self> {
+            Ok(Self::Real(TaskStore::new(env)?))
+        }
+
+        pub async fn dump(
+            env: Arc<milli::heed::Env>,
+            path: impl AsRef<Path>,
+            update_file_store: UpdateFileStore,
+        ) -> Result<()> {
+            TaskStore::dump(env, path, update_file_store).await
+        }
+
+        pub fn mock(mocker: Mocker) -> Self {
+            Self::Mock(Arc::new(mocker))
+        }
+
+        pub async fn update_tasks(&self, tasks: Vec<Task>) -> Result<Vec<Task>> {
+            match self {
+                Self::Real(s) => s.update_tasks(tasks).await,
+                Self::Mock(m) => unsafe {
+                    m.get::<_, Result<Vec<Task>>>("update_tasks").call(tasks)
+                },
+            }
+        }
+
+        pub async fn get_task(&self, id: TaskId, filter: Option<TaskFilter>) -> Result<Task> {
+            match self {
+                Self::Real(s) => s.get_task(id, filter).await,
+                Self::Mock(m) => unsafe { m.get::<_, Result<Task>>("get_task").call((id, filter)) },
+            }
+        }
+
+        pub async fn get_processing_tasks(
+            &self,
+            tasks: Processing,
+        ) -> Result<(Processing, BatchContent)> {
+            match self {
+                Self::Real(s) => s.get_processing_tasks(tasks).await,
+                Self::Mock(m) => unsafe { m.get("get_pending_task").call(tasks) },
+            }
+        }
+
+        pub async fn fetch_unfinished_tasks(&self, from: Option<TaskId>) -> Result<Vec<Task>> {
+            match self {
+                Self::Real(s) => s.fetch_unfinished_tasks(from).await,
+                Self::Mock(m) => unsafe { m.get("fetch_unfinished_tasks").call(from) },
+            }
+        }
+
+        pub async fn list_tasks(
+            &self,
+            from: Option<TaskId>,
+            filter: Option<TaskFilter>,
+            limit: Option<usize>,
+        ) -> Result<Vec<Task>> {
+            match self {
+                Self::Real(s) => s.list_tasks(from, filter, limit).await,
+                Self::Mock(m) => unsafe { m.get("list_tasks").call((from, filter, limit)) },
+            }
+        }
+
+        pub async fn register(&self, content: TaskContent) -> Result<Task> {
+            match self {
+                Self::Real(s) => s.register(content).await,
+                Self::Mock(_m) => todo!(),
+            }
+        }
+
+        pub fn register_raw_update(&self, wtxn: &mut RwTxn, task: &Task) -> Result<()> {
+            match self {
+                Self::Real(s) => s.register_raw_update(wtxn, task),
+                Self::Mock(_m) => todo!(),
+            }
+        }
+
+        pub fn load_dump(path: impl AsRef<Path>, env: Arc<Env>) -> anyhow::Result<()> {
+            TaskStore::load_dump(path, env)
+        }
+    }
+
+    #[test]
+    fn test_increment_task_id() {
+        let tmp = tmp_env();
+        let store = Store::new(tmp.env()).unwrap();
+
+        let mut txn = store.wtxn().unwrap();
+        assert_eq!(store.next_task_id(&mut txn).unwrap(), 0);
+        txn.abort().unwrap();
+
+        let gen_task = |id: TaskId| Task {
+            id,
+            content: TaskContent::IndexCreation {
+                primary_key: None,
+                index_uid: IndexUid::new_unchecked("test"),
+            },
+            events: Vec::new(),
+        };
+
+        let mut runner = TestRunner::new(Config::default());
+        runner
+            .run(&(0..100u32).prop_map(gen_task), |task| {
+                let mut txn = store.wtxn().unwrap();
+                let previous_id = store.next_task_id(&mut txn).unwrap();
+
+                store.put(&mut txn, &task).unwrap();
+
+                let next_id = store.next_task_id(&mut txn).unwrap();
+
+                // if we put a task whose task_id is less than the next_id, then the next_id remains
+                // unchanged, otherwise it becomes task.id + 1
+                if task.id < previous_id {
+                    assert_eq!(next_id, previous_id)
+                } else {
+                    assert_eq!(next_id, task.id + 1);
+                }
+
+                txn.commit().unwrap();
+
+                Ok(())
+            })
+            .unwrap();
+    }
+}
Author	SHA1	Message	Date
curquiza	b7d9551870	Merge branch 'release-v0.30.0' into stable	2022-11-28 13:56:35 +01:00
bors[bot]	1387a211d2	Merge #3053 3053: Upgrade alpine 3.14 to 3.16 r=Kerollmops a=curquiza Otherwise CI is failing https://github.com/meilisearch/meilisearch/actions/runs/3470576605/jobs/5799173168 Co-authored-by: curquiza <clementine@meilisearch.com>	2022-11-15 13:56:45 +00:00
curquiza	661b345ad9	Upgrade alpine 3.14 to 3.16	2022-11-15 14:54:18 +01:00
bors[bot]	0f0d1dccf0	Merge #3047 3047: Fix soft deleted bug settings r=curquiza a=Kerollmops This PR fixes https://github.com/meilisearch/meilisearch/issues/3021 and fixes https://github.com/meilisearch/meilisearch/issues/2945 and is released as version 0.29.2. Co-authored-by: Kerollmops <clement@meilisearch.com>	2022-11-15 11:08:47 +00:00
Kerollmops	0331fc7c71	Make clippy happy	2022-11-15 12:07:00 +01:00
Kerollmops	5cfcdbb55a	Bump the version to v0.29.2	2022-11-14 17:39:10 +01:00
Kerollmops	c77c3a90a0	Use milli v0.33.5	2022-11-14 17:39:09 +01:00