mirror of
https://github.com/meilisearch/meilisearch.git
synced 2025-09-09 06:06:30 +00:00
Merge branch 'main' into indexer-edition-2024
This commit is contained in:
83
crates/milli/src/update/new/indexer/partial_dump.rs
Normal file
83
crates/milli/src/update/new/indexer/partial_dump.rs
Normal file
@ -0,0 +1,83 @@
|
||||
use std::ops::DerefMut;
|
||||
|
||||
use rayon::iter::IndexedParallelIterator;
|
||||
use serde_json::value::RawValue;
|
||||
|
||||
use super::document_changes::{DocumentChangeContext, DocumentChanges, MostlySend, RefCellExt};
|
||||
use crate::documents::PrimaryKey;
|
||||
use crate::update::concurrent_available_ids::ConcurrentAvailableIds;
|
||||
use crate::update::new::document::DocumentFromVersions;
|
||||
use crate::update::new::document_change::Versions;
|
||||
use crate::update::new::{DocumentChange, Insertion};
|
||||
use crate::{Error, InternalError, Result, UserError};
|
||||
|
||||
/// Wrapper around an iterator of documents, each of which is turned into an insertion.
///
/// The iterator is stored untouched; to be usable through `into_changes` it has to be an
/// `IndexedParallelIterator` yielding `Box<RawValue>` items.
pub struct PartialDump<I> {
    /// Source of the documents, handed over unchanged to `PartialDumpChanges`.
    iter: I,
}
|
||||
|
||||
impl<I> PartialDump<I> {
|
||||
pub fn new_from_jsonlines(iter: I) -> Self {
|
||||
PartialDump { iter }
|
||||
}
|
||||
|
||||
pub fn into_changes<'index>(
|
||||
self,
|
||||
concurrent_available_ids: &'index ConcurrentAvailableIds,
|
||||
primary_key: &'index PrimaryKey,
|
||||
) -> PartialDumpChanges<'index, I> {
|
||||
/// Note for future self:
|
||||
/// - We recommend sending chunks of documents in this `PartialDumpIndexer` we therefore need to create a custom take_while_size method (that doesn't drop items).
|
||||
PartialDumpChanges { iter: self.iter, concurrent_available_ids, primary_key }
|
||||
}
|
||||
}
|
||||
|
||||
pub struct PartialDumpChanges<'doc, I> {
|
||||
iter: I,
|
||||
concurrent_available_ids: &'doc ConcurrentAvailableIds,
|
||||
primary_key: &'doc PrimaryKey<'doc>,
|
||||
}
|
||||
|
||||
impl<'index, Iter> DocumentChanges<'index> for PartialDumpChanges<'index, Iter>
|
||||
where
|
||||
Iter: IndexedParallelIterator<Item = Box<RawValue>> + Clone + Sync + 'index,
|
||||
{
|
||||
type Item = Box<RawValue>;
|
||||
|
||||
fn iter(&self) -> impl IndexedParallelIterator<Item = Self::Item> {
|
||||
self.iter.clone()
|
||||
}
|
||||
|
||||
fn item_to_document_change<'doc, T: MostlySend + 'doc>(
|
||||
&'doc self,
|
||||
context: &'doc DocumentChangeContext<T>,
|
||||
document: Self::Item,
|
||||
) -> Result<Option<DocumentChange<'doc>>>
|
||||
where
|
||||
'index: 'doc,
|
||||
{
|
||||
let doc_alloc = &context.doc_alloc;
|
||||
let docid = match self.concurrent_available_ids.next() {
|
||||
Some(id) => id,
|
||||
None => return Err(Error::UserError(UserError::DocumentLimitReached)),
|
||||
};
|
||||
|
||||
let mut fields_ids_map = context.new_fields_ids_map.borrow_mut_or_yield();
|
||||
let fields_ids_map = fields_ids_map.deref_mut();
|
||||
|
||||
let document = doc_alloc.alloc_str(document.get());
|
||||
let document: &RawValue = unsafe { std::mem::transmute(document) };
|
||||
|
||||
let external_document_id =
|
||||
self.primary_key.extract_fields_and_docid(document, fields_ids_map, doc_alloc)?;
|
||||
let external_document_id = external_document_id.to_de();
|
||||
|
||||
let document = raw_collections::RawMap::from_raw_value(document, doc_alloc)
|
||||
.map_err(InternalError::SerdeJson)?;
|
||||
|
||||
let document = document.into_bump_slice();
|
||||
let document = DocumentFromVersions::new(Versions::Single(document));
|
||||
|
||||
let insertion = Insertion::create(docid, external_document_id, document);
|
||||
Ok(Some(DocumentChange::Insertion(insertion)))
|
||||
}
|
||||
}
|
Reference in New Issue
Block a user