mirror of
https://github.com/meilisearch/meilisearch.git
synced 2025-11-22 12:46:53 +00:00
Compare commits
4 Commits
prototype-
...
paralleliz
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
c4cedb5390 | ||
|
|
0b40892eca | ||
|
|
7ddd8a2b66 | ||
|
|
2addedf98a |
28
Cargo.lock
generated
28
Cargo.lock
generated
@@ -736,6 +736,12 @@ dependencies = [
|
||||
"syn 2.0.101",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "boxcar"
|
||||
version = "0.2.14"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "36f64beae40a84da1b4b26ff2761a5b895c12adc41dc25aaee1c4f2bbfe97a6e"
|
||||
|
||||
[[package]]
|
||||
name = "brotli"
|
||||
version = "8.0.1"
|
||||
@@ -3928,6 +3934,7 @@ dependencies = [
|
||||
"big_s",
|
||||
"bimap",
|
||||
"bincode",
|
||||
"boxcar",
|
||||
"bstr",
|
||||
"bumpalo",
|
||||
"bumparaw-collections",
|
||||
@@ -3970,6 +3977,7 @@ dependencies = [
|
||||
"obkv",
|
||||
"once_cell",
|
||||
"ordered-float 5.0.0",
|
||||
"papaya",
|
||||
"rand 0.8.5",
|
||||
"rayon",
|
||||
"rhai",
|
||||
@@ -4411,6 +4419,16 @@ dependencies = [
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "papaya"
|
||||
version = "0.2.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f92dd0b07c53a0a0c764db2ace8c541dc47320dad97c2200c2a637ab9dd2328f"
|
||||
dependencies = [
|
||||
"equivalent",
|
||||
"seize",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "parking_lot"
|
||||
version = "0.12.4"
|
||||
@@ -5462,6 +5480,16 @@ dependencies = [
|
||||
"time",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "seize"
|
||||
version = "0.5.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e4b8d813387d566f627f3ea1b914c068aac94c40ae27ec43f5f33bde65abefe7"
|
||||
dependencies = [
|
||||
"libc",
|
||||
"windows-sys 0.52.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "semver"
|
||||
version = "1.0.26"
|
||||
|
||||
@@ -109,6 +109,8 @@ utoipa = { version = "5.4.0", features = [
|
||||
"openapi_extensions",
|
||||
] }
|
||||
lru = "0.14.0"
|
||||
boxcar = "0.2.14"
|
||||
papaya = "0.2.3"
|
||||
|
||||
[dev-dependencies]
|
||||
mimalloc = { version = "0.1.47", default-features = false }
|
||||
|
||||
@@ -1,11 +1,14 @@
|
||||
use std::cell::RefCell;
|
||||
use std::sync::atomic::Ordering;
|
||||
use std::sync::{OnceLock, RwLock};
|
||||
|
||||
use bumpalo::collections::CollectIn;
|
||||
use bumpalo::Bump;
|
||||
use bumparaw_collections::RawMap;
|
||||
use hashbrown::hash_map::Entry;
|
||||
use heed::RoTxn;
|
||||
use heed::{RoTxn, WithoutTls};
|
||||
use memmap2::Mmap;
|
||||
use rayon::iter::{IndexedParallelIterator, IntoParallelIterator, ParallelIterator};
|
||||
use rayon::slice::ParallelSlice;
|
||||
use rustc_hash::FxBuildHasher;
|
||||
use serde_json::value::RawValue;
|
||||
@@ -20,8 +23,11 @@ use crate::update::new::document::{DocumentContext, Versions};
|
||||
use crate::update::new::steps::IndexingStep;
|
||||
use crate::update::new::thread_local::MostlySend;
|
||||
use crate::update::new::{DocumentIdentifiers, Insertion, Update};
|
||||
use crate::update::{AvailableIds, IndexDocumentsMethod};
|
||||
use crate::{DocumentId, Error, FieldsIdsMap, Index, InternalError, Result, UserError};
|
||||
use crate::update::{ConcurrentAvailableIds, IndexDocumentsMethod};
|
||||
use crate::{
|
||||
DocumentId, Error, FieldIdMapWithMetadata, FieldsIdsMap, GlobalFieldsIdsMap, Index,
|
||||
InternalError, MetadataBuilder, Result, UserError,
|
||||
};
|
||||
|
||||
#[derive(Default)]
|
||||
pub struct DocumentOperation<'pl> {
|
||||
@@ -62,90 +68,108 @@ impl<'pl> DocumentOperation<'pl> {
|
||||
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::document_operation")]
|
||||
pub fn into_changes<MSP>(
|
||||
pub fn into_changes<'e: 'pl, MSP>(
|
||||
self,
|
||||
indexer: &'pl Bump,
|
||||
index: &Index,
|
||||
rtxn: &'pl RoTxn<'pl>,
|
||||
index: &'e Index,
|
||||
rtxn: &'pl RoTxn<'pl, WithoutTls>,
|
||||
primary_key_from_op: Option<&'pl str>,
|
||||
new_fields_ids_map: &mut FieldsIdsMap,
|
||||
new_fid_map: &mut FieldsIdsMap,
|
||||
must_stop_processing: &MSP,
|
||||
progress: Progress,
|
||||
) -> Result<(DocumentOperationChanges<'pl>, Vec<PayloadStats>, Option<PrimaryKey<'pl>>)>
|
||||
where
|
||||
MSP: Fn() -> bool,
|
||||
MSP: Fn() -> bool + Sync,
|
||||
{
|
||||
progress.update_progress(IndexingStep::PreparingPayloads);
|
||||
let Self { operations } = self;
|
||||
|
||||
let metadata_builder = MetadataBuilder::from_index(index, rtxn)?;
|
||||
let fid_map_with_meta = FieldIdMapWithMetadata::new(new_fid_map.clone(), metadata_builder);
|
||||
let global = RwLock::new(fid_map_with_meta);
|
||||
let gfid_map = GlobalFieldsIdsMap::new(&global);
|
||||
|
||||
let documents_ids = index.documents_ids(rtxn)?;
|
||||
let mut operations_stats = Vec::new();
|
||||
let mut available_docids = AvailableIds::new(&documents_ids);
|
||||
let mut docids_version_offsets = hashbrown::HashMap::new();
|
||||
let mut primary_key = None;
|
||||
let available_docids = ConcurrentAvailableIds::new(documents_ids);
|
||||
let docids_version_offsets = papaya::HashMap::new();
|
||||
// let mut docids_version_offsets = hashbrown::HashMap::new();
|
||||
let primary_key = OnceLock::new();
|
||||
|
||||
let payload_count = operations.len();
|
||||
let (step, progress_step) = AtomicPayloadStep::new(payload_count as u32);
|
||||
progress.update_progress(progress_step);
|
||||
|
||||
for (payload_index, operation) in operations.into_iter().enumerate() {
|
||||
if must_stop_processing() {
|
||||
return Err(InternalError::AbortedIndexation.into());
|
||||
}
|
||||
step.store(payload_index as u32, Ordering::Relaxed);
|
||||
|
||||
let mut bytes = 0;
|
||||
let result = match operation {
|
||||
Payload::Replace(payload) => extract_addition_payload_changes(
|
||||
indexer,
|
||||
index,
|
||||
rtxn,
|
||||
primary_key_from_op,
|
||||
&mut primary_key,
|
||||
new_fields_ids_map,
|
||||
&mut available_docids,
|
||||
&mut bytes,
|
||||
&docids_version_offsets,
|
||||
IndexDocumentsMethod::ReplaceDocuments,
|
||||
payload,
|
||||
),
|
||||
Payload::Update(payload) => extract_addition_payload_changes(
|
||||
indexer,
|
||||
index,
|
||||
rtxn,
|
||||
primary_key_from_op,
|
||||
&mut primary_key,
|
||||
new_fields_ids_map,
|
||||
&mut available_docids,
|
||||
&mut bytes,
|
||||
&docids_version_offsets,
|
||||
IndexDocumentsMethod::UpdateDocuments,
|
||||
payload,
|
||||
),
|
||||
Payload::Deletion(to_delete) => extract_deletion_payload_changes(
|
||||
index,
|
||||
rtxn,
|
||||
&mut available_docids,
|
||||
&docids_version_offsets,
|
||||
to_delete,
|
||||
),
|
||||
};
|
||||
|
||||
let mut document_count = 0;
|
||||
let error = match result {
|
||||
Ok(new_docids_version_offsets) => {
|
||||
document_count = new_docids_version_offsets.len() as u64;
|
||||
// If we don't have any error then we can merge the content of this payload
|
||||
// into to main payload. Else we just drop this payload extraction.
|
||||
merge_version_offsets(&mut docids_version_offsets, new_docids_version_offsets);
|
||||
None
|
||||
let long_txn_id = rtxn.id();
|
||||
let operations_stats = operations
|
||||
.into_par_iter()
|
||||
.enumerate()
|
||||
.map(|(payload_index, operation)| {
|
||||
if must_stop_processing() {
|
||||
return Err(InternalError::AbortedIndexation.into());
|
||||
}
|
||||
Err(Error::UserError(user_error)) => Some(user_error),
|
||||
Err(e) => return Err(e),
|
||||
};
|
||||
operations_stats.push(PayloadStats { document_count, bytes, error });
|
||||
}
|
||||
step.store(payload_count as u32, Ordering::Relaxed);
|
||||
step.fetch_add(1, Ordering::Relaxed);
|
||||
|
||||
let short = index.read_txn()?;
|
||||
// SAFETY: The long_txn_id comes from the main rtxn and the long lifetime is the one of the long rtxn.
|
||||
let rtxn: &'pl RoTxn<'e, _> = unsafe { extend_rtxn_lifetime(long_txn_id, &short) };
|
||||
let indexer = bumpalo::Bump::new();
|
||||
let mut gfid_map = gfid_map.clone();
|
||||
|
||||
let mut bytes = 0;
|
||||
let result = match operation {
|
||||
Payload::Replace(payload) => extract_addition_payload_changes(
|
||||
&indexer,
|
||||
index,
|
||||
&rtxn,
|
||||
primary_key_from_op,
|
||||
&primary_key,
|
||||
&mut gfid_map,
|
||||
&available_docids,
|
||||
&mut bytes,
|
||||
&docids_version_offsets,
|
||||
IndexDocumentsMethod::ReplaceDocuments,
|
||||
payload,
|
||||
),
|
||||
Payload::Update(payload) => extract_addition_payload_changes(
|
||||
&indexer,
|
||||
index,
|
||||
&rtxn,
|
||||
primary_key_from_op,
|
||||
&primary_key,
|
||||
&mut gfid_map,
|
||||
&available_docids,
|
||||
&mut bytes,
|
||||
&docids_version_offsets,
|
||||
IndexDocumentsMethod::UpdateDocuments,
|
||||
payload,
|
||||
),
|
||||
Payload::Deletion(to_delete) => extract_deletion_payload_changes(
|
||||
index,
|
||||
&rtxn,
|
||||
&available_docids,
|
||||
&docids_version_offsets,
|
||||
to_delete,
|
||||
),
|
||||
};
|
||||
|
||||
match result {
|
||||
Ok(new_docids_version_offsets) => {
|
||||
let document_count = new_docids_version_offsets.len() as u64;
|
||||
// If we don't have any error then we can merge the content of this payload
|
||||
// into to main payload. Else we just drop this payload extraction.
|
||||
merge_version_offsets(
|
||||
&mut docids_version_offsets,
|
||||
new_docids_version_offsets,
|
||||
);
|
||||
Ok(PayloadStats { document_count, bytes, error: None })
|
||||
}
|
||||
Err(Error::UserError(user_error)) => {
|
||||
Ok(PayloadStats { document_count: 0, bytes, error: Some(user_error) })
|
||||
}
|
||||
Err(e) => Err(e),
|
||||
}
|
||||
})
|
||||
.collect::<Result<Vec<_>>>()?;
|
||||
|
||||
// TODO We must drain the HashMap into a Vec because rayon::hash_map::IntoIter: !Clone
|
||||
let mut docids_version_offsets: bumpalo::collections::vec::Vec<_> =
|
||||
@@ -157,21 +181,25 @@ impl<'pl> DocumentOperation<'pl> {
|
||||
.sort_unstable_by_key(|(_, po)| first_update_pointer(&po.operations).unwrap_or(0));
|
||||
|
||||
let docids_version_offsets = docids_version_offsets.into_bump_slice();
|
||||
Ok((DocumentOperationChanges { docids_version_offsets }, operations_stats, primary_key))
|
||||
Ok((
|
||||
DocumentOperationChanges { docids_version_offsets },
|
||||
operations_stats,
|
||||
primary_key.into_inner(),
|
||||
))
|
||||
}
|
||||
}
|
||||
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
fn extract_addition_payload_changes<'r, 'pl: 'r>(
|
||||
indexer: &'pl Bump,
|
||||
indexer: &Bump,
|
||||
index: &Index,
|
||||
rtxn: &'r RoTxn<'r>,
|
||||
primary_key_from_op: Option<&'r str>,
|
||||
primary_key: &mut Option<PrimaryKey<'r>>,
|
||||
new_fields_ids_map: &mut FieldsIdsMap,
|
||||
available_docids: &mut AvailableIds,
|
||||
primary_key: &OnceLock<PrimaryKey<'r>>,
|
||||
new_fields_ids_map: &mut GlobalFieldsIdsMap,
|
||||
available_docids: &ConcurrentAvailableIds,
|
||||
bytes: &mut u64,
|
||||
main_docids_version_offsets: &hashbrown::HashMap<&'pl str, PayloadOperations<'pl>>,
|
||||
main_docids_version_offsets: &papaya::HashMap<&'pl str, PayloadOperations<'pl>>,
|
||||
method: IndexDocumentsMethod,
|
||||
payload: &'pl [u8],
|
||||
) -> Result<hashbrown::HashMap<&'pl str, PayloadOperations<'pl>>> {
|
||||
@@ -204,10 +232,10 @@ fn extract_addition_payload_changes<'r, 'pl: 'r>(
|
||||
Err(error) => return Err(error),
|
||||
};
|
||||
|
||||
primary_key.get_or_insert(pk)
|
||||
primary_key.get_or_init(|| pk)
|
||||
} else {
|
||||
// primary key was retrieved in the first iteration or in a previous payload
|
||||
primary_key.as_ref().unwrap()
|
||||
primary_key.get().unwrap()
|
||||
};
|
||||
|
||||
let external_id =
|
||||
@@ -313,12 +341,12 @@ fn extract_addition_payload_changes<'r, 'pl: 'r>(
|
||||
);
|
||||
match result {
|
||||
Ok(Ok((pk, _))) => {
|
||||
primary_key.get_or_insert(pk);
|
||||
primary_key.get_or_init(|| pk);
|
||||
}
|
||||
Ok(Err(UserError::NoPrimaryKeyCandidateFound)) => (),
|
||||
Ok(Err(user_error)) => return Err(Error::UserError(user_error)),
|
||||
Err(error) => return Err(error),
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
Ok(new_docids_version_offsets)
|
||||
@@ -327,7 +355,7 @@ fn extract_addition_payload_changes<'r, 'pl: 'r>(
|
||||
fn extract_deletion_payload_changes<'s, 'pl: 's>(
|
||||
index: &Index,
|
||||
rtxn: &RoTxn,
|
||||
available_docids: &mut AvailableIds,
|
||||
available_docids: &ConcurrentAvailableIds,
|
||||
main_docids_version_offsets: &hashbrown::HashMap<&'s str, PayloadOperations<'pl>>,
|
||||
to_delete: &'pl [&'pl str],
|
||||
) -> Result<hashbrown::HashMap<&'s str, PayloadOperations<'pl>>> {
|
||||
@@ -612,3 +640,21 @@ pub fn first_update_pointer(docops: &[InnerDocOp]) -> Option<usize> {
|
||||
InnerDocOp::Deletion => None,
|
||||
})
|
||||
}
|
||||
|
||||
/// Extends the lifetime of a read-only transaction.
|
||||
/// The `long_txn_id` transaction ID must come from a call to the `RoTxn::id` method.
|
||||
///
|
||||
/// ## Panics
|
||||
///
|
||||
/// Panics if the long transaction ID does not match the transaction ID of the short transaction.
|
||||
pub unsafe fn extend_rtxn_lifetime<'e, 'long, 'short>(
|
||||
long_txn_id: usize,
|
||||
short: &'short RoTxn<'e, WithoutTls>,
|
||||
) -> &'long RoTxn<'e, WithoutTls> {
|
||||
assert_eq!(
|
||||
long_txn_id,
|
||||
short.id(),
|
||||
"Lifetimes can only be extended if they have the same transaction ID"
|
||||
);
|
||||
unsafe { std::mem::transmute(short) }
|
||||
}
|
||||
|
||||
@@ -4,7 +4,7 @@ use rustc_hash::FxBuildHasher;
|
||||
|
||||
use crate::documents::{PrimaryKey, DEFAULT_PRIMARY_KEY};
|
||||
use crate::update::new::StdResult;
|
||||
use crate::{FieldsIdsMap, Index, Result, UserError};
|
||||
use crate::{FieldsIdsMap, GlobalFieldsIdsMap, Index, Result, UserError};
|
||||
|
||||
/// Returns the primary key that has already been set for this index or the
|
||||
/// one we will guess by searching for the first key that contains "id" as a substring,
|
||||
@@ -12,7 +12,7 @@ use crate::{FieldsIdsMap, Index, Result, UserError};
|
||||
pub fn retrieve_or_guess_primary_key<'a>(
|
||||
rtxn: &'a RoTxn<'a>,
|
||||
index: &Index,
|
||||
new_fields_ids_map: &mut FieldsIdsMap,
|
||||
new_fields_ids_map: &mut GlobalFieldsIdsMap, // Use a &mut Mapper: MutFieldIdMapper
|
||||
primary_key_from_op: Option<&'a str>,
|
||||
first_document: Option<RawMap<'a, FxBuildHasher>>,
|
||||
) -> Result<StdResult<(PrimaryKey<'a>, bool), UserError>> {
|
||||
@@ -47,7 +47,7 @@ pub fn retrieve_or_guess_primary_key<'a>(
|
||||
let guesses: Result<Vec<&str>> = first_document
|
||||
.keys()
|
||||
.filter_map(|name| {
|
||||
let Some(_) = new_fields_ids_map.insert(name) else {
|
||||
let Some(_) = new_fields_ids_map.id_or_insert(name) else {
|
||||
return Some(Err(UserError::AttributeLimitReached.into()));
|
||||
};
|
||||
name.to_lowercase().ends_with(DEFAULT_PRIMARY_KEY).then_some(Ok(name))
|
||||
|
||||
Reference in New Issue
Block a user