WIP sending multiparts of 250MiB
@@ -285,7 +285,19 @@ impl IndexScheduler {
 
         use async_compression::tokio::write::GzipEncoder;
         use async_compression::Level;
+        use bytes::{Bytes, BytesMut};
+        use rusty_s3::actions::UploadPart;
         use tokio::fs::File;
+        use tokio::io::AsyncReadExt;
+        use tokio::task::JoinHandle;
+
+        const ONE_HOUR: Duration = Duration::from_secs(3600);
+        // default part size is 250MiB
+        const MIN_PART_SIZE: usize = 250 * 1024 * 1024;
+        // 10MiB
+        const TEN_MIB: usize = 10 * 1024 * 1024;
+        // The maximum number of parts that can be uploaded to a single multipart upload.
+        const MAX_NUMBER_PARTS: usize = 10_000;
 
         // The maximum number of parts that can be uploaded in parallel.
         const S3_MAX_IN_FLIGHT_PARTS: &str = "MEILI_S3_MAX_IN_FLIGHT_PARTS";
@@ -299,69 +311,185 @@ impl IndexScheduler {
         let url = bucket_url.parse().unwrap();
         let bucket = Bucket::new(url, UrlStyle::Path, bucket_name, bucket_region).unwrap();
         let credential = Credentials::new(access_key, secret_key);
-
-        let (writer, reader) = tokio::net::unix::pipe::pipe()?;
-        let compressed_writer = GzipEncoder::with_quality(writer, Level::Fastest);
-        let mut tarball = tokio_tar::Builder::new(compressed_writer);
-
-        // 1. Snapshot the version file
-        tarball.append_path_with_name(&self.scheduler.version_file_path, VERSION_FILE_NAME).await?;
-
-        // 2. Snapshot the index scheduler LMDB env
-        progress.update_progress(SnapshotCreationProgress::SnapshotTheIndexScheduler);
-        let mut tasks_env_file = self.env.try_clone_inner_file().map(File::from_std)?;
-        let path = Path::new("tasks").join("data.mdb");
-        tarball.append_file(path, &mut tasks_env_file).await?;
-        drop(tasks_env_file);
-
-        // 2.3 Create a read transaction on the index-scheduler
-        let rtxn = self.env.read_txn()?;
-
-        // 2.4 Create the update files directory
-        // And only copy the update files of the enqueued tasks
-        progress.update_progress(SnapshotCreationProgress::SnapshotTheUpdateFiles);
-        let enqueued = self.queue.tasks.get_status(&rtxn, Status::Enqueued)?;
-        let (atomic, update_file_progress) = AtomicUpdateFileStep::new(enqueued.len() as u32);
-        progress.update_progress(update_file_progress);
-        let update_files_dir = Path::new("update_files");
-        for task_id in enqueued {
-            let task =
-                self.queue.tasks.get_task(&rtxn, task_id)?.ok_or(Error::CorruptedTaskQueue)?;
-            if let Some(content_uuid) = task.content_uuid() {
-                let src = self.queue.file_store.update_path(content_uuid);
-                let mut update_file = File::open(src).await?;
-                let path = update_files_dir.join(content_uuid.to_string());
-                tarball.append_file(path, &mut update_file).await?;
-            }
-            atomic.fetch_add(1, Ordering::Relaxed);
-        }
-
-        // 3. Snapshot every indexes
-        progress.update_progress(SnapshotCreationProgress::SnapshotTheIndexes);
-        let index_mapping = self.index_mapper.index_mapping;
-        let nb_indexes = index_mapping.len(&rtxn)? as u32;
-        let indexes_dir = Path::new("indexes");
-        for (i, result) in index_mapping.iter(&rtxn)?.enumerate() {
-            let (name, uuid) = result?;
-            progress.update_progress(VariableNameStep::<SnapshotCreationProgress>::new(
-                name, i as u32, nb_indexes,
-            ));
-            let index = self.index_mapper.index(&rtxn, name)?;
-            let path = indexes_dir.join(uuid.to_string());
-            let mut index_file = index.try_clone_inner_file().map(File::from_std).unwrap();
-            tarball.append_file(path, &mut index_file).await?;
-        }
-
-        drop(rtxn);
-
-        // 4. Snapshot the auth LMDB env
-        progress.update_progress(SnapshotCreationProgress::SnapshotTheApiKeys);
-        let mut auth_env_file =
-            self.scheduler.auth_env.try_clone_inner_file().map(File::from_std).unwrap();
-        let path = Path::new("auth").join("data.mdb");
-        tarball.append_file(path, &mut auth_env_file).await?;
-
-        tarball.finish().await?;
+        // TODO change this and use the database name like in the original version
+        let object = "data.ms.snapshot";
+
+        // TODO implement exponential backoff on upload requests: https://docs.rs/backoff
+        // TODO return a result with actual errors
+        // TODO sign for longer than an hour?
+        // TODO Use a better thing than a String for the object path
+        let (writer, mut reader) = tokio::net::unix::pipe::pipe()?;
+        let uploader_task = tokio::spawn(async move {
+            let action = bucket.create_multipart_upload(Some(&credential), object);
+            // TODO Question: If it is only signed for an hour and a snapshot takes longer than an hour, what happens?
+            // If the part is deleted (like a TTL) we should sign it for at least 24 hours.
+            let url = action.sign(ONE_HOUR);
+            let resp = client.post(url).send().await.unwrap().error_for_status().unwrap();
+            let body = resp.text().await.unwrap();
+
+            let multipart = CreateMultipartUpload::parse_response(&body).unwrap();
+            let mut etags = Vec::<String>::new();
+
+            let mut in_flight = VecDeque::<(
+                JoinHandle<std::result::Result<reqwest::Response, reqwest::Error>>,
+                Bytes,
+            )>::with_capacity(max_in_flight_parts);
+            for part_number in 1u16.. {
+                let part_upload = bucket.upload_part(
+                    Some(&credential),
+                    object,
+                    part_number,
+                    multipart.upload_id(),
+                );
+                let url = part_upload.sign(ONE_HOUR);
+
+                // Wait for a buffer to be ready if there are in-flight parts that landed
+                let mut buffer = if in_flight.len() >= max_in_flight_parts {
+                    let (request, buffer) = in_flight.pop_front().unwrap();
+                    let mut buffer = buffer.try_into_mut().expect("to convert into a mut buffer");
+                    let resp = request.await.unwrap().unwrap().error_for_status().unwrap();
+                    let etag =
+                        resp.headers().get(ETAG).expect("every UploadPart request returns an Etag");
+                    // TODO use bumpalo to reduce the number of allocations
+                    etags.push(etag.to_str().unwrap().to_owned());
+                    buffer.clear();
+                    buffer
+                } else {
+                    // TODO Base this on the available memory
+                    BytesMut::with_capacity(MIN_PART_SIZE)
+                };
+
+                while buffer.len() < (MIN_PART_SIZE / 2) {
+                    if reader.read_buf(&mut buffer).await? == 0 {
+                        break;
+                    }
+                }
+
+                if buffer.is_empty() {
+                    break;
+                }
+
+                let body = buffer.freeze();
+                let task = tokio::spawn(client.put(url).body(body.clone()).send());
+                in_flight.push_back((task, body));
+            }
+
+            for (join_handle, _buffer) in in_flight {
+                let resp = join_handle.await.unwrap().unwrap().error_for_status().unwrap();
+                let etag =
+                    resp.headers().get(ETAG).expect("every UploadPart request returns an Etag");
+                // TODO use bumpalo to reduce the number of allocations
+                etags.push(etag.to_str().unwrap().to_owned());
+            }
+
+            let action = bucket.complete_multipart_upload(
+                Some(&credential),
+                object,
+                multipart.upload_id(),
+                etags.iter().map(AsRef::as_ref),
+            );
+            let url = action.sign(ONE_HOUR);
+            let resp = client
+                .post(url)
+                .body(action.body())
+                .send()
+                .await
+                .unwrap()
+                .error_for_status()
+                .unwrap();
+
+            // TODO do a better check and do not assert
+            assert!(resp.status().is_success());
+
+            Result::<_, Error>::Ok(())
+        });
+
+        // TODO not a big fan of this clone
+        // remove it and get all the necessary data from the scheduler
+        let index_scheduler = IndexScheduler::private_clone(self);
+        let builder_task = tokio::task::spawn_local(async move {
+            let compressed_writer = GzipEncoder::with_quality(writer, Level::Fastest);
+            let mut tarball = tokio_tar::Builder::new(compressed_writer);
+
+            // 1. Snapshot the version file
+            tarball
+                .append_path_with_name(
+                    &index_scheduler.scheduler.version_file_path,
+                    VERSION_FILE_NAME,
+                )
+                .await?;
+
+            // 2. Snapshot the index scheduler LMDB env
+            progress.update_progress(SnapshotCreationProgress::SnapshotTheIndexScheduler);
+            let mut tasks_env_file =
+                index_scheduler.env.try_clone_inner_file().map(File::from_std)?;
+            let path = Path::new("tasks").join("data.mdb");
+            tarball.append_file(path, &mut tasks_env_file).await?;
+            drop(tasks_env_file);
+
+            // 2.3 Create a read transaction on the index-scheduler
+            let rtxn = index_scheduler.env.read_txn()?;
+
+            // 2.4 Create the update files directory
+            // And only copy the update files of the enqueued tasks
+            progress.update_progress(SnapshotCreationProgress::SnapshotTheUpdateFiles);
+            let enqueued = index_scheduler.queue.tasks.get_status(&rtxn, Status::Enqueued)?;
+            let (atomic, update_file_progress) = AtomicUpdateFileStep::new(enqueued.len() as u32);
+            progress.update_progress(update_file_progress);
+            let update_files_dir = Path::new("update_files");
+            for task_id in enqueued {
+                let task = index_scheduler
+                    .queue
+                    .tasks
+                    .get_task(&rtxn, task_id)?
+                    .ok_or(Error::CorruptedTaskQueue)?;
+                if let Some(content_uuid) = task.content_uuid() {
+                    let src = index_scheduler.queue.file_store.update_path(content_uuid);
+                    let mut update_file = File::open(src).await?;
+                    let path = update_files_dir.join(content_uuid.to_string());
+                    tarball.append_file(path, &mut update_file).await?;
+                }
+                atomic.fetch_add(1, Ordering::Relaxed);
+            }
+
+            // 3. Snapshot every indexes
+            progress.update_progress(SnapshotCreationProgress::SnapshotTheIndexes);
+            let index_mapping = index_scheduler.index_mapper.index_mapping;
+            let nb_indexes = index_mapping.len(&rtxn)? as u32;
+            let indexes_dir = Path::new("indexes");
+            for (i, result) in index_mapping.iter(&rtxn)?.enumerate() {
+                let (name, uuid) = result?;
+                progress.update_progress(VariableNameStep::<SnapshotCreationProgress>::new(
+                    name, i as u32, nb_indexes,
+                ));
+                let index = index_scheduler.index_mapper.index(&rtxn, name)?;
+                let path = indexes_dir.join(uuid.to_string());
+                let mut index_file = index.try_clone_inner_file().map(File::from_std).unwrap();
+                tarball.append_file(path, &mut index_file).await?;
+            }
+
+            drop(rtxn);
+
+            // 4. Snapshot the auth LMDB env
+            progress.update_progress(SnapshotCreationProgress::SnapshotTheApiKeys);
+            let mut auth_env_file = index_scheduler
+                .scheduler
+                .auth_env
+                .try_clone_inner_file()
+                .map(File::from_std)
+                .unwrap();
+            let path = Path::new("auth").join("data.mdb");
+            tarball.append_file(path, &mut auth_env_file).await?;
+
+            tarball.finish().await?;
+
+            Result::<_, Error>::Ok(())
+        });
+
+        let (uploader_result, builder_result) = tokio::join!(uploader_task, builder_task);
+
+        uploader_result.unwrap()?;
+        builder_result.unwrap()?;
 
         for task in &mut tasks {
             task.status = Status::Succeeded;
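
Note on the uploader task above (not part of the diff): it bounds how many parts are in flight at once and recycles the buffer of the oldest finished part instead of allocating a fresh 250MiB buffer per part. Below is a minimal, self-contained sketch of that pattern with the signed S3 `PUT` replaced by a dummy `fake_upload` task; `PART_SIZE`, `MAX_IN_FLIGHT`, and the input bytes are illustrative stand-ins, and only the `tokio` and `bytes` crates are assumed:

```rust
use std::collections::VecDeque;

use bytes::{Bytes, BytesMut};
use tokio::task::JoinHandle;

// Illustrative stand-ins for MIN_PART_SIZE and MEILI_S3_MAX_IN_FLIGHT_PARTS.
const PART_SIZE: usize = 8;
const MAX_IN_FLIGHT: usize = 2;

// Stand-in for the signed `client.put(url).body(body).send()` request.
async fn fake_upload(part_number: u16, body: Bytes) -> String {
    format!("etag-{part_number}-{}", body.len())
}

#[tokio::main] // requires the "macros" and "rt-multi-thread" tokio features
async fn main() {
    // Stand-in for the gzip/tar pipe the real uploader reads from.
    let mut source: &[u8] = b"bytes that would normally be streamed out of the tarball pipe";
    let mut etags = Vec::new();
    let mut in_flight = VecDeque::<(JoinHandle<String>, Bytes)>::new();

    for part_number in 1u16.. {
        // Once the queue is full, wait for the oldest part and reuse its buffer;
        // otherwise allocate a fresh one.
        let mut buffer = if in_flight.len() >= MAX_IN_FLIGHT {
            let (handle, bytes) = in_flight.pop_front().unwrap();
            etags.push(handle.await.unwrap());
            let mut buffer = bytes.try_into_mut().expect("the request dropped its clone");
            buffer.clear();
            buffer
        } else {
            BytesMut::with_capacity(PART_SIZE)
        };

        // Fill the buffer (the real code loops on `reader.read_buf` until half a part is ready).
        let take = source.len().min(PART_SIZE);
        buffer.extend_from_slice(&source[..take]);
        source = &source[take..];

        if buffer.is_empty() {
            break; // nothing left to upload
        }

        // Keep one clone in the queue so the buffer can be recycled later.
        let body = buffer.freeze();
        in_flight.push_back((tokio::spawn(fake_upload(part_number, body.clone())), body));
    }

    // Drain the uploads that are still in flight, in order.
    for (handle, _body) in in_flight {
        etags.push(handle.await.unwrap());
    }
    println!("{etags:?}");
}
```

Unlike the WIP code above, the sketch awaits the oldest request before calling `try_into_mut`, since the conversion back to `BytesMut` can only succeed once the request has released its clone of the `Bytes`.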
@@ -370,91 +498,3 @@ impl IndexScheduler {
         Ok(tasks)
     }
 }
-
-// TODO implement exponential backoff on upload requests: https://docs.rs/backoff
-// TODO return a result with actual errors
-// TODO sign for longer than an hour?
-// TODO Use a better thing than a String for the object path
-async fn multipart_upload(
-    bucket: &Bucket,
-    client: &Client,
-    credential: Option<&Credentials>,
-    max_in_flight_parts: usize,
-    bytes: bytes::Bytes,
-    object: &str,
-) -> Result<()> {
-    const ONE_HOUR: Duration = Duration::from_secs(3600);
-    // default part size is 250MiB
-    const MIN_PART_SIZE: usize = 250 * 1024 * 1024;
-    // 10MiB
-    const TEN_MIB: usize = 10 * 1024 * 1024;
-    // The maximum number of parts that can be uploaded to a single multipart upload.
-    const MAX_NUMBER_PARTS: usize = 10_000;
-
-    let action = bucket.create_multipart_upload(credential, object);
-    // TODO Question: If it is only signed for an hour and a snapshot takes longer than an hour, what happens?
-    // If the part is deleted (like a TTL) we should sign it for at least 24 hours.
-    let url = action.sign(ONE_HOUR);
-    let resp = client.post(url).send().await.unwrap().error_for_status().unwrap();
-    let body = resp.text().await.unwrap();
-
-    let multipart = CreateMultipartUpload::parse_response(&body).unwrap();
-    let mut etags = Vec::<String>::new();
-
-    let part_size = bytes.len() / MAX_NUMBER_PARTS;
-    let part_size = if part_size < TEN_MIB { MIN_PART_SIZE } else { part_size };
-
-    let mut in_flight_parts = VecDeque::with_capacity(max_in_flight_parts);
-    let number_of_parts = bytes.len().div_ceil(part_size);
-    for i in 0..number_of_parts {
-        let part_number = u16::try_from(i).unwrap().checked_add(1).unwrap();
-        let part_upload =
-            bucket.upload_part(credential, object, part_number, multipart.upload_id());
-        let url = part_upload.sign(ONE_HOUR);
-
-        // Make sure we do not read out of bound
-        let body = if bytes.len() < part_size * (i + 1) {
-            bytes.slice(part_size * i..)
-        } else {
-            bytes.slice(part_size * i..part_size * (i + 1))
-        };
-
-        let task = tokio::spawn(client.put(url).body(body).send());
-        in_flight_parts.push_back(task);
-
-        if in_flight_parts.len() == max_in_flight_parts {
-            let resp = in_flight_parts
-                .pop_front()
-                .unwrap()
-                .await
-                .unwrap()
-                .unwrap()
-                .error_for_status()
-                .unwrap();
-            let etag = resp.headers().get(ETAG).expect("every UploadPart request returns an Etag");
-            // TODO use bumpalo to reduce the number of allocations
-            etags.push(etag.to_str().unwrap().to_owned());
-        }
-    }
-
-    for join_handle in in_flight_parts {
-        let resp = join_handle.await.unwrap().unwrap().error_for_status().unwrap();
-        let etag = resp.headers().get(ETAG).expect("every UploadPart request returns an Etag");
-        // TODO use bumpalo to reduce the number of allocations
-        etags.push(etag.to_str().unwrap().to_owned());
-    }
-
-    let action = bucket.complete_multipart_upload(
-        credential,
-        object,
-        multipart.upload_id(),
-        etags.iter().map(AsRef::as_ref),
-    );
-    let url = action.sign(ONE_HOUR);
-    let resp =
-        client.post(url).body(action.body()).send().await.unwrap().error_for_status().unwrap();
-
-    assert!(resp.status().is_success());
-
-    Ok(())
-}
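
Note on the removed `multipart_upload` helper above (not part of the diff): it sizes parts so that the whole payload fits in at most MAX_NUMBER_PARTS parts while never dropping below the minimum part size, then slices the payload without reading past the end. A small, runnable sketch of that arithmetic with tiny stand-in constants (only the `bytes` crate is assumed):

```rust
use bytes::Bytes;

// Tiny stand-ins for the real constants (10_000 parts, 10MiB, 250MiB).
const MAX_NUMBER_PARTS: usize = 4;
const TEN_MIB: usize = 10;
const MIN_PART_SIZE: usize = 16;

fn main() {
    // Stand-in payload; the removed helper received the whole snapshot as `bytes::Bytes`.
    let bytes = Bytes::from(vec![0u8; 100]);

    // Aim for at most MAX_NUMBER_PARTS parts; if that would make parts too small,
    // fall back to the minimum part size instead.
    let part_size = bytes.len() / MAX_NUMBER_PARTS;
    let part_size = if part_size < TEN_MIB { MIN_PART_SIZE } else { part_size };

    let number_of_parts = bytes.len().div_ceil(part_size);
    for i in 0..number_of_parts {
        // S3 part numbers start at 1.
        let part_number = u16::try_from(i).unwrap().checked_add(1).unwrap();
        // Make sure the last part does not read out of bounds.
        let body = if bytes.len() < part_size * (i + 1) {
            bytes.slice(part_size * i..)
        } else {
            bytes.slice(part_size * i..part_size * (i + 1))
        };
        println!("part {part_number}: {} bytes", body.len());
    }
}
```

With the real constants, any payload smaller than roughly 100GiB (10_000 parts of 10MiB) is cut into 250MiB parts, which is what the commit title refers to.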