Compare commits

...

4 Commits

Author SHA1 Message Date
Clément Renault
c4cedb5390 WIP: Introducing papaya 2025-08-19 12:10:03 +02:00
Clément Renault
0b40892eca WIP: Ooops, I did some unsafe 2025-08-19 11:49:30 +02:00
Clément Renault
7ddd8a2b66 WIP: I need multiple rtxns and will probably use a bit of unsafe 2025-08-14 17:34:52 +02:00
Clément Renault
2addedf98a WIP: parallelizing 2025-08-14 15:16:26 +02:00
4 changed files with 159 additions and 83 deletions

28
Cargo.lock generated
View File

@@ -736,6 +736,12 @@ dependencies = [
"syn 2.0.101",
]
[[package]]
name = "boxcar"
version = "0.2.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "36f64beae40a84da1b4b26ff2761a5b895c12adc41dc25aaee1c4f2bbfe97a6e"
[[package]]
name = "brotli"
version = "8.0.1"
@@ -3928,6 +3934,7 @@ dependencies = [
"big_s",
"bimap",
"bincode",
"boxcar",
"bstr",
"bumpalo",
"bumparaw-collections",
@@ -3970,6 +3977,7 @@ dependencies = [
"obkv",
"once_cell",
"ordered-float 5.0.0",
"papaya",
"rand 0.8.5",
"rayon",
"rhai",
@@ -4411,6 +4419,16 @@ dependencies = [
"winapi",
]
[[package]]
name = "papaya"
version = "0.2.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f92dd0b07c53a0a0c764db2ace8c541dc47320dad97c2200c2a637ab9dd2328f"
dependencies = [
"equivalent",
"seize",
]
[[package]]
name = "parking_lot"
version = "0.12.4"
@@ -5462,6 +5480,16 @@ dependencies = [
"time",
]
[[package]]
name = "seize"
version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e4b8d813387d566f627f3ea1b914c068aac94c40ae27ec43f5f33bde65abefe7"
dependencies = [
"libc",
"windows-sys 0.52.0",
]
[[package]]
name = "semver"
version = "1.0.26"

View File

@@ -109,6 +109,8 @@ utoipa = { version = "5.4.0", features = [
"openapi_extensions",
] }
lru = "0.14.0"
boxcar = "0.2.14"
papaya = "0.2.3"
[dev-dependencies]
mimalloc = { version = "0.1.47", default-features = false }

View File

@@ -1,11 +1,14 @@
use std::cell::RefCell;
use std::sync::atomic::Ordering;
use std::sync::{OnceLock, RwLock};
use bumpalo::collections::CollectIn;
use bumpalo::Bump;
use bumparaw_collections::RawMap;
use hashbrown::hash_map::Entry;
use heed::RoTxn;
use heed::{RoTxn, WithoutTls};
use memmap2::Mmap;
use rayon::iter::{IndexedParallelIterator, IntoParallelIterator, ParallelIterator};
use rayon::slice::ParallelSlice;
use rustc_hash::FxBuildHasher;
use serde_json::value::RawValue;
@@ -20,8 +23,11 @@ use crate::update::new::document::{DocumentContext, Versions};
use crate::update::new::steps::IndexingStep;
use crate::update::new::thread_local::MostlySend;
use crate::update::new::{DocumentIdentifiers, Insertion, Update};
use crate::update::{AvailableIds, IndexDocumentsMethod};
use crate::{DocumentId, Error, FieldsIdsMap, Index, InternalError, Result, UserError};
use crate::update::{ConcurrentAvailableIds, IndexDocumentsMethod};
use crate::{
DocumentId, Error, FieldIdMapWithMetadata, FieldsIdsMap, GlobalFieldsIdsMap, Index,
InternalError, MetadataBuilder, Result, UserError,
};
#[derive(Default)]
pub struct DocumentOperation<'pl> {
@@ -62,90 +68,108 @@ impl<'pl> DocumentOperation<'pl> {
#[allow(clippy::too_many_arguments)]
#[tracing::instrument(level = "trace", skip_all, target = "indexing::document_operation")]
pub fn into_changes<MSP>(
pub fn into_changes<'e: 'pl, MSP>(
self,
indexer: &'pl Bump,
index: &Index,
rtxn: &'pl RoTxn<'pl>,
index: &'e Index,
rtxn: &'pl RoTxn<'pl, WithoutTls>,
primary_key_from_op: Option<&'pl str>,
new_fields_ids_map: &mut FieldsIdsMap,
new_fid_map: &mut FieldsIdsMap,
must_stop_processing: &MSP,
progress: Progress,
) -> Result<(DocumentOperationChanges<'pl>, Vec<PayloadStats>, Option<PrimaryKey<'pl>>)>
where
MSP: Fn() -> bool,
MSP: Fn() -> bool + Sync,
{
progress.update_progress(IndexingStep::PreparingPayloads);
let Self { operations } = self;
let metadata_builder = MetadataBuilder::from_index(index, rtxn)?;
let fid_map_with_meta = FieldIdMapWithMetadata::new(new_fid_map.clone(), metadata_builder);
let global = RwLock::new(fid_map_with_meta);
let gfid_map = GlobalFieldsIdsMap::new(&global);
let documents_ids = index.documents_ids(rtxn)?;
let mut operations_stats = Vec::new();
let mut available_docids = AvailableIds::new(&documents_ids);
let mut docids_version_offsets = hashbrown::HashMap::new();
let mut primary_key = None;
let available_docids = ConcurrentAvailableIds::new(documents_ids);
let docids_version_offsets = papaya::HashMap::new();
// let mut docids_version_offsets = hashbrown::HashMap::new();
let primary_key = OnceLock::new();
let payload_count = operations.len();
let (step, progress_step) = AtomicPayloadStep::new(payload_count as u32);
progress.update_progress(progress_step);
for (payload_index, operation) in operations.into_iter().enumerate() {
if must_stop_processing() {
return Err(InternalError::AbortedIndexation.into());
}
step.store(payload_index as u32, Ordering::Relaxed);
let mut bytes = 0;
let result = match operation {
Payload::Replace(payload) => extract_addition_payload_changes(
indexer,
index,
rtxn,
primary_key_from_op,
&mut primary_key,
new_fields_ids_map,
&mut available_docids,
&mut bytes,
&docids_version_offsets,
IndexDocumentsMethod::ReplaceDocuments,
payload,
),
Payload::Update(payload) => extract_addition_payload_changes(
indexer,
index,
rtxn,
primary_key_from_op,
&mut primary_key,
new_fields_ids_map,
&mut available_docids,
&mut bytes,
&docids_version_offsets,
IndexDocumentsMethod::UpdateDocuments,
payload,
),
Payload::Deletion(to_delete) => extract_deletion_payload_changes(
index,
rtxn,
&mut available_docids,
&docids_version_offsets,
to_delete,
),
};
let mut document_count = 0;
let error = match result {
Ok(new_docids_version_offsets) => {
document_count = new_docids_version_offsets.len() as u64;
// If we don't have any error then we can merge the content of this payload
// into to main payload. Else we just drop this payload extraction.
merge_version_offsets(&mut docids_version_offsets, new_docids_version_offsets);
None
let long_txn_id = rtxn.id();
let operations_stats = operations
.into_par_iter()
.enumerate()
.map(|(payload_index, operation)| {
if must_stop_processing() {
return Err(InternalError::AbortedIndexation.into());
}
Err(Error::UserError(user_error)) => Some(user_error),
Err(e) => return Err(e),
};
operations_stats.push(PayloadStats { document_count, bytes, error });
}
step.store(payload_count as u32, Ordering::Relaxed);
step.fetch_add(1, Ordering::Relaxed);
let short = index.read_txn()?;
// SAFETY: The long_txn_id comes from the main rtxn and the long lifetime is the one of the long rtxn.
let rtxn: &'pl RoTxn<'e, _> = unsafe { extend_rtxn_lifetime(long_txn_id, &short) };
let indexer = bumpalo::Bump::new();
let mut gfid_map = gfid_map.clone();
let mut bytes = 0;
let result = match operation {
Payload::Replace(payload) => extract_addition_payload_changes(
&indexer,
index,
&rtxn,
primary_key_from_op,
&primary_key,
&mut gfid_map,
&available_docids,
&mut bytes,
&docids_version_offsets,
IndexDocumentsMethod::ReplaceDocuments,
payload,
),
Payload::Update(payload) => extract_addition_payload_changes(
&indexer,
index,
&rtxn,
primary_key_from_op,
&primary_key,
&mut gfid_map,
&available_docids,
&mut bytes,
&docids_version_offsets,
IndexDocumentsMethod::UpdateDocuments,
payload,
),
Payload::Deletion(to_delete) => extract_deletion_payload_changes(
index,
&rtxn,
&available_docids,
&docids_version_offsets,
to_delete,
),
};
match result {
Ok(new_docids_version_offsets) => {
let document_count = new_docids_version_offsets.len() as u64;
// If we don't have any error then we can merge the content of this payload
// into to main payload. Else we just drop this payload extraction.
merge_version_offsets(
&mut docids_version_offsets,
new_docids_version_offsets,
);
Ok(PayloadStats { document_count, bytes, error: None })
}
Err(Error::UserError(user_error)) => {
Ok(PayloadStats { document_count: 0, bytes, error: Some(user_error) })
}
Err(e) => Err(e),
}
})
.collect::<Result<Vec<_>>>()?;
// TODO We must drain the HashMap into a Vec because rayon::hash_map::IntoIter: !Clone
let mut docids_version_offsets: bumpalo::collections::vec::Vec<_> =
@@ -157,21 +181,25 @@ impl<'pl> DocumentOperation<'pl> {
.sort_unstable_by_key(|(_, po)| first_update_pointer(&po.operations).unwrap_or(0));
let docids_version_offsets = docids_version_offsets.into_bump_slice();
Ok((DocumentOperationChanges { docids_version_offsets }, operations_stats, primary_key))
Ok((
DocumentOperationChanges { docids_version_offsets },
operations_stats,
primary_key.into_inner(),
))
}
}
#[allow(clippy::too_many_arguments)]
fn extract_addition_payload_changes<'r, 'pl: 'r>(
indexer: &'pl Bump,
indexer: &Bump,
index: &Index,
rtxn: &'r RoTxn<'r>,
primary_key_from_op: Option<&'r str>,
primary_key: &mut Option<PrimaryKey<'r>>,
new_fields_ids_map: &mut FieldsIdsMap,
available_docids: &mut AvailableIds,
primary_key: &OnceLock<PrimaryKey<'r>>,
new_fields_ids_map: &mut GlobalFieldsIdsMap,
available_docids: &ConcurrentAvailableIds,
bytes: &mut u64,
main_docids_version_offsets: &hashbrown::HashMap<&'pl str, PayloadOperations<'pl>>,
main_docids_version_offsets: &papaya::HashMap<&'pl str, PayloadOperations<'pl>>,
method: IndexDocumentsMethod,
payload: &'pl [u8],
) -> Result<hashbrown::HashMap<&'pl str, PayloadOperations<'pl>>> {
@@ -204,10 +232,10 @@ fn extract_addition_payload_changes<'r, 'pl: 'r>(
Err(error) => return Err(error),
};
primary_key.get_or_insert(pk)
primary_key.get_or_init(|| pk)
} else {
// primary key was retrieved in the first iteration or in a previous payload
primary_key.as_ref().unwrap()
primary_key.get().unwrap()
};
let external_id =
@@ -313,12 +341,12 @@ fn extract_addition_payload_changes<'r, 'pl: 'r>(
);
match result {
Ok(Ok((pk, _))) => {
primary_key.get_or_insert(pk);
primary_key.get_or_init(|| pk);
}
Ok(Err(UserError::NoPrimaryKeyCandidateFound)) => (),
Ok(Err(user_error)) => return Err(Error::UserError(user_error)),
Err(error) => return Err(error),
};
}
}
Ok(new_docids_version_offsets)
@@ -327,7 +355,7 @@ fn extract_addition_payload_changes<'r, 'pl: 'r>(
fn extract_deletion_payload_changes<'s, 'pl: 's>(
index: &Index,
rtxn: &RoTxn,
available_docids: &mut AvailableIds,
available_docids: &ConcurrentAvailableIds,
main_docids_version_offsets: &hashbrown::HashMap<&'s str, PayloadOperations<'pl>>,
to_delete: &'pl [&'pl str],
) -> Result<hashbrown::HashMap<&'s str, PayloadOperations<'pl>>> {
@@ -612,3 +640,21 @@ pub fn first_update_pointer(docops: &[InnerDocOp]) -> Option<usize> {
InnerDocOp::Deletion => None,
})
}
/// Extends the lifetime of a read-only transaction.
/// The `long_txn_id` transaction ID must come from a call to the `RoTxn::id` method.
///
/// ## Panics
///
/// Panics if the long transaction ID does not match the transaction ID of the short transaction.
pub unsafe fn extend_rtxn_lifetime<'e, 'long, 'short>(
long_txn_id: usize,
short: &'short RoTxn<'e, WithoutTls>,
) -> &'long RoTxn<'e, WithoutTls> {
assert_eq!(
long_txn_id,
short.id(),
"Lifetimes can only be extended if they have the same transaction ID"
);
unsafe { std::mem::transmute(short) }
}

View File

@@ -4,7 +4,7 @@ use rustc_hash::FxBuildHasher;
use crate::documents::{PrimaryKey, DEFAULT_PRIMARY_KEY};
use crate::update::new::StdResult;
use crate::{FieldsIdsMap, Index, Result, UserError};
use crate::{FieldsIdsMap, GlobalFieldsIdsMap, Index, Result, UserError};
/// Returns the primary key that has already been set for this index or the
/// one we will guess by searching for the first key that contains "id" as a substring,
@@ -12,7 +12,7 @@ use crate::{FieldsIdsMap, Index, Result, UserError};
pub fn retrieve_or_guess_primary_key<'a>(
rtxn: &'a RoTxn<'a>,
index: &Index,
new_fields_ids_map: &mut FieldsIdsMap,
new_fields_ids_map: &mut GlobalFieldsIdsMap, // Use a &mut Mapper: MutFieldIdMapper
primary_key_from_op: Option<&'a str>,
first_document: Option<RawMap<'a, FxBuildHasher>>,
) -> Result<StdResult<(PrimaryKey<'a>, bool), UserError>> {
@@ -47,7 +47,7 @@ pub fn retrieve_or_guess_primary_key<'a>(
let guesses: Result<Vec<&str>> = first_document
.keys()
.filter_map(|name| {
let Some(_) = new_fields_ids_map.insert(name) else {
let Some(_) = new_fields_ids_map.id_or_insert(name) else {
return Some(Err(UserError::AttributeLimitReached.into()));
};
name.to_lowercase().ends_with(DEFAULT_PRIMARY_KEY).then_some(Ok(name))