Prefer using a stable hash builder rather than a random one

This commit is contained in:
Kerollmops
2024-12-10 14:25:53 +01:00
parent 6b269795d2
commit a751972c57
8 changed files with 40 additions and 23 deletions

View File

@@ -2,6 +2,7 @@ use std::collections::{BTreeMap, BTreeSet};
use bumparaw_collections::RawMap; use bumparaw_collections::RawMap;
use heed::RoTxn; use heed::RoTxn;
use rustc_hash::FxBuildHasher;
use serde_json::value::RawValue; use serde_json::value::RawValue;
use super::vector_document::VectorDocument; use super::vector_document::VectorDocument;
@@ -385,12 +386,12 @@ pub type Entry<'doc> = (&'doc str, &'doc RawValue);
#[derive(Debug)] #[derive(Debug)]
pub struct Versions<'doc> { pub struct Versions<'doc> {
data: RawMap<'doc>, data: RawMap<'doc, FxBuildHasher>,
} }
impl<'doc> Versions<'doc> { impl<'doc> Versions<'doc> {
pub fn multiple( pub fn multiple(
mut versions: impl Iterator<Item = Result<RawMap<'doc>>>, mut versions: impl Iterator<Item = Result<RawMap<'doc, FxBuildHasher>>>,
) -> Result<Option<Self>> { ) -> Result<Option<Self>> {
let Some(data) = versions.next() else { return Ok(None) }; let Some(data) = versions.next() else { return Ok(None) };
let mut data = data?; let mut data = data?;
@@ -403,7 +404,7 @@ impl<'doc> Versions<'doc> {
Ok(Some(Self::single(data))) Ok(Some(Self::single(data)))
} }
pub fn single(version: RawMap<'doc>) -> Self { pub fn single(version: RawMap<'doc, FxBuildHasher>) -> Self {
Self { data: version } Self { data: version }
} }

View File

@@ -179,6 +179,7 @@ mod test {
use bumparaw_collections::RawMap; use bumparaw_collections::RawMap;
use charabia::TokenizerBuilder; use charabia::TokenizerBuilder;
use meili_snap::snapshot; use meili_snap::snapshot;
use rustc_hash::FxBuildHasher;
use serde_json::json; use serde_json::json;
use serde_json::value::RawValue; use serde_json::value::RawValue;
@@ -234,7 +235,7 @@ mod test {
let bump = Bump::new(); let bump = Bump::new();
let document: &RawValue = serde_json::from_str(&document).unwrap(); let document: &RawValue = serde_json::from_str(&document).unwrap();
let document = RawMap::from_raw_value(document, &bump).unwrap(); let document = RawMap::from_raw_value_and_hasher(document, FxBuildHasher, &bump).unwrap();
let document = Versions::single(document); let document = Versions::single(document);
let document = DocumentFromVersions::new(&document); let document = DocumentFromVersions::new(&document);

View File

@@ -2,6 +2,7 @@ use std::ops::ControlFlow;
use bumpalo::Bump; use bumpalo::Bump;
use bumparaw_collections::RawVec; use bumparaw_collections::RawVec;
use rustc_hash::FxBuildHasher;
use serde::de::{DeserializeSeed, Deserializer as _, Visitor}; use serde::de::{DeserializeSeed, Deserializer as _, Visitor};
use serde_json::value::RawValue; use serde_json::value::RawValue;
@@ -394,7 +395,7 @@ impl<'a> Iterator for DeserrRawVecIter<'a> {
} }
pub struct DeserrRawMap<'a> { pub struct DeserrRawMap<'a> {
map: bumparaw_collections::RawMap<'a>, map: bumparaw_collections::RawMap<'a, FxBuildHasher>,
alloc: &'a Bump, alloc: &'a Bump,
} }

View File

@@ -5,6 +5,7 @@ use hashbrown::hash_map::Entry;
use heed::RoTxn; use heed::RoTxn;
use memmap2::Mmap; use memmap2::Mmap;
use rayon::slice::ParallelSlice; use rayon::slice::ParallelSlice;
use rustc_hash::FxBuildHasher;
use serde_json::value::RawValue; use serde_json::value::RawValue;
use serde_json::Deserializer; use serde_json::Deserializer;
@@ -166,8 +167,9 @@ fn extract_addition_payload_changes<'r, 'pl: 'r>(
// Only guess the primary key if it is the first document // Only guess the primary key if it is the first document
let retrieved_primary_key = if previous_offset == 0 { let retrieved_primary_key = if previous_offset == 0 {
let doc = let doc = RawMap::from_raw_value_and_hasher(doc, FxBuildHasher, indexer)
RawMap::from_raw_value(doc, indexer).map(Some).map_err(UserError::SerdeJson)?; .map(Some)
.map_err(UserError::SerdeJson)?;
let result = retrieve_or_guess_primary_key( let result = retrieve_or_guess_primary_key(
rtxn, rtxn,
@@ -546,7 +548,8 @@ impl MergeChanges for MergeDocumentForReplacement {
Some(InnerDocOp::Addition(DocumentOffset { content })) => { Some(InnerDocOp::Addition(DocumentOffset { content })) => {
let document = serde_json::from_slice(content).unwrap(); let document = serde_json::from_slice(content).unwrap();
let document = let document =
RawMap::from_raw_value(document, doc_alloc).map_err(UserError::SerdeJson)?; RawMap::from_raw_value_and_hasher(document, FxBuildHasher, doc_alloc)
.map_err(UserError::SerdeJson)?;
if is_new { if is_new {
Ok(Some(DocumentChange::Insertion(Insertion::create( Ok(Some(DocumentChange::Insertion(Insertion::create(
@@ -633,7 +636,8 @@ impl MergeChanges for MergeDocumentForUpdates {
}; };
let document = serde_json::from_slice(content).unwrap(); let document = serde_json::from_slice(content).unwrap();
let document = let document =
RawMap::from_raw_value(document, doc_alloc).map_err(UserError::SerdeJson)?; RawMap::from_raw_value_and_hasher(document, FxBuildHasher, doc_alloc)
.map_err(UserError::SerdeJson)?;
Some(Versions::single(document)) Some(Versions::single(document))
} }
@@ -647,8 +651,9 @@ impl MergeChanges for MergeDocumentForUpdates {
}; };
let document = serde_json::from_slice(content).unwrap(); let document = serde_json::from_slice(content).unwrap();
let document = RawMap::from_raw_value(document, doc_alloc) let document =
.map_err(UserError::SerdeJson)?; RawMap::from_raw_value_and_hasher(document, FxBuildHasher, doc_alloc)
.map_err(UserError::SerdeJson)?;
Ok(document) Ok(document)
}); });
Versions::multiple(versions)? Versions::multiple(versions)?

View File

@@ -14,6 +14,7 @@ use heed::{RoTxn, RwTxn};
use itertools::{merge_join_by, EitherOrBoth}; use itertools::{merge_join_by, EitherOrBoth};
pub use partial_dump::PartialDump; pub use partial_dump::PartialDump;
use rand::SeedableRng as _; use rand::SeedableRng as _;
use rustc_hash::FxBuildHasher;
use time::OffsetDateTime; use time::OffsetDateTime;
pub use update_by_function::UpdateByFunction; pub use update_by_function::UpdateByFunction;
@@ -776,7 +777,7 @@ pub fn retrieve_or_guess_primary_key<'a>(
index: &Index, index: &Index,
new_fields_ids_map: &mut FieldsIdsMap, new_fields_ids_map: &mut FieldsIdsMap,
primary_key_from_op: Option<&'a str>, primary_key_from_op: Option<&'a str>,
first_document: Option<RawMap<'a>>, first_document: Option<RawMap<'a, FxBuildHasher>>,
) -> Result<StdResult<(PrimaryKey<'a>, bool), UserError>> { ) -> Result<StdResult<(PrimaryKey<'a>, bool), UserError>> {
// make sure that we have a declared primary key, either fetching it from the index or attempting to guess it. // make sure that we have a declared primary key, either fetching it from the index or attempting to guess it.

View File

@@ -2,6 +2,7 @@ use std::ops::DerefMut;
use bumparaw_collections::RawMap; use bumparaw_collections::RawMap;
use rayon::iter::IndexedParallelIterator; use rayon::iter::IndexedParallelIterator;
use rustc_hash::FxBuildHasher;
use serde_json::value::RawValue; use serde_json::value::RawValue;
use super::document_changes::{DocumentChangeContext, DocumentChanges}; use super::document_changes::{DocumentChangeContext, DocumentChanges};
@@ -76,8 +77,8 @@ where
self.primary_key.extract_fields_and_docid(document, fields_ids_map, doc_alloc)?; self.primary_key.extract_fields_and_docid(document, fields_ids_map, doc_alloc)?;
let external_document_id = external_document_id.to_de(); let external_document_id = external_document_id.to_de();
let document = let document = RawMap::from_raw_value_and_hasher(document, FxBuildHasher, doc_alloc)
RawMap::from_raw_value(document, doc_alloc).map_err(InternalError::SerdeJson)?; .map_err(InternalError::SerdeJson)?;
let insertion = Insertion::create(docid, external_document_id, Versions::single(document)); let insertion = Insertion::create(docid, external_document_id, Versions::single(document));
Ok(Some(DocumentChange::Insertion(insertion))) Ok(Some(DocumentChange::Insertion(insertion)))

View File

@@ -3,6 +3,7 @@ use rayon::iter::IndexedParallelIterator;
use rayon::slice::ParallelSlice as _; use rayon::slice::ParallelSlice as _;
use rhai::{Dynamic, Engine, OptimizationLevel, Scope, AST}; use rhai::{Dynamic, Engine, OptimizationLevel, Scope, AST};
use roaring::RoaringBitmap; use roaring::RoaringBitmap;
use rustc_hash::FxBuildHasher;
use super::document_changes::DocumentChangeContext; use super::document_changes::DocumentChangeContext;
use super::DocumentChanges; use super::DocumentChanges;
@@ -160,8 +161,12 @@ impl<'index> DocumentChanges<'index> for UpdateByFunctionChanges<'index> {
if document_id != new_document_id { if document_id != new_document_id {
Err(Error::UserError(UserError::DocumentEditionCannotModifyPrimaryKey)) Err(Error::UserError(UserError::DocumentEditionCannotModifyPrimaryKey))
} else { } else {
let raw_new_doc = RawMap::from_raw_value(raw_new_doc, doc_alloc) let raw_new_doc = RawMap::from_raw_value_and_hasher(
.map_err(InternalError::SerdeJson)?; raw_new_doc,
FxBuildHasher,
doc_alloc,
)
.map_err(InternalError::SerdeJson)?;
Ok(Some(DocumentChange::Update(Update::create( Ok(Some(DocumentChange::Update(Update::create(
docid, docid,

View File

@@ -4,6 +4,7 @@ use bumpalo::Bump;
use bumparaw_collections::RawMap; use bumparaw_collections::RawMap;
use deserr::{Deserr, IntoValue}; use deserr::{Deserr, IntoValue};
use heed::RoTxn; use heed::RoTxn;
use rustc_hash::FxBuildHasher;
use serde::Serialize; use serde::Serialize;
use serde_json::value::RawValue; use serde_json::value::RawValue;
@@ -84,7 +85,7 @@ pub struct VectorDocumentFromDb<'t> {
docid: DocumentId, docid: DocumentId,
embedding_config: Vec<IndexEmbeddingConfig>, embedding_config: Vec<IndexEmbeddingConfig>,
index: &'t Index, index: &'t Index,
vectors_field: Option<RawMap<'t>>, vectors_field: Option<RawMap<'t, FxBuildHasher>>,
rtxn: &'t RoTxn<'t>, rtxn: &'t RoTxn<'t>,
doc_alloc: &'t Bump, doc_alloc: &'t Bump,
} }
@@ -102,9 +103,10 @@ impl<'t> VectorDocumentFromDb<'t> {
}; };
let vectors = document.vectors_field()?; let vectors = document.vectors_field()?;
let vectors_field = match vectors { let vectors_field = match vectors {
Some(vectors) => { Some(vectors) => Some(
Some(RawMap::from_raw_value(vectors, doc_alloc).map_err(InternalError::SerdeJson)?) RawMap::from_raw_value_and_hasher(vectors, FxBuildHasher, doc_alloc)
} .map_err(InternalError::SerdeJson)?,
),
None => None, None => None,
}; };
@@ -220,7 +222,7 @@ fn entry_from_raw_value(
pub struct VectorDocumentFromVersions<'doc> { pub struct VectorDocumentFromVersions<'doc> {
external_document_id: &'doc str, external_document_id: &'doc str,
vectors: RawMap<'doc>, vectors: RawMap<'doc, FxBuildHasher>,
embedders: &'doc EmbeddingConfigs, embedders: &'doc EmbeddingConfigs,
} }
@@ -233,8 +235,8 @@ impl<'doc> VectorDocumentFromVersions<'doc> {
) -> Result<Option<Self>> { ) -> Result<Option<Self>> {
let document = DocumentFromVersions::new(versions); let document = DocumentFromVersions::new(versions);
if let Some(vectors_field) = document.vectors_field()? { if let Some(vectors_field) = document.vectors_field()? {
let vectors = let vectors = RawMap::from_raw_value_and_hasher(vectors_field, FxBuildHasher, bump)
RawMap::from_raw_value(vectors_field, bump).map_err(UserError::SerdeJson)?; .map_err(UserError::SerdeJson)?;
Ok(Some(Self { external_document_id, vectors, embedders })) Ok(Some(Self { external_document_id, vectors, embedders }))
} else { } else {
Ok(None) Ok(None)