Merge remote-tracking branch 'origin/main' into tmp-release-v1.15.1

This commit is contained in:
Clément Renault
2025-06-12 10:21:07 +02:00
58 changed files with 2312 additions and 1756 deletions

View File

@@ -33,7 +33,6 @@ use crate::documents::{obkv_to_object, DocumentsBatchReader};
use crate::error::{Error, InternalError};
use crate::index::{PrefixSearch, PrefixSettings};
use crate::progress::Progress;
use crate::thread_pool_no_abort::ThreadPoolNoAbortBuilder;
pub use crate::update::index_documents::helpers::CursorClonableMmap;
use crate::update::{
IndexerConfig, UpdateIndexingStep, WordPrefixDocids, WordPrefixIntegerDocids, WordsPrefixesFst,
@@ -228,24 +227,7 @@ where
let possible_embedding_mistakes =
crate::vector::error::PossibleEmbeddingMistakes::new(&field_distribution);
let backup_pool;
let pool = match self.indexer_config.thread_pool {
Some(ref pool) => pool,
None => {
// We initialize a backup pool with the default
// settings if none have already been set.
#[allow(unused_mut)]
let mut pool_builder = ThreadPoolNoAbortBuilder::new();
#[cfg(test)]
{
pool_builder = pool_builder.num_threads(1);
}
backup_pool = pool_builder.build()?;
&backup_pool
}
};
let pool = &self.indexer_config.thread_pool;
// create LMDB writer channel
let (lmdb_writer_sx, lmdb_writer_rx): (

View File

@@ -1,7 +1,7 @@
use grenad::CompressionType;
use super::GrenadParameters;
use crate::thread_pool_no_abort::ThreadPoolNoAbort;
use crate::{thread_pool_no_abort::ThreadPoolNoAbort, ThreadPoolNoAbortBuilder};
#[derive(Debug)]
pub struct IndexerConfig {
@@ -9,9 +9,10 @@ pub struct IndexerConfig {
pub max_nb_chunks: Option<usize>,
pub documents_chunk_size: Option<usize>,
pub max_memory: Option<usize>,
pub max_threads: Option<usize>,
pub chunk_compression_type: CompressionType,
pub chunk_compression_level: Option<u32>,
pub thread_pool: Option<ThreadPoolNoAbort>,
pub thread_pool: ThreadPoolNoAbort,
pub max_positions_per_attributes: Option<u32>,
pub skip_index_budget: bool,
}
@@ -27,16 +28,39 @@ impl IndexerConfig {
}
}
/// By default use only 1 thread for indexing in tests
#[cfg(test)]
pub fn default_thread_pool_and_threads() -> (ThreadPoolNoAbort, Option<usize>) {
let pool = ThreadPoolNoAbortBuilder::new_for_indexing()
.num_threads(1)
.build()
.expect("failed to build default rayon thread pool");
(pool, Some(1))
}
#[cfg(not(test))]
pub fn default_thread_pool_and_threads() -> (ThreadPoolNoAbort, Option<usize>) {
let pool = ThreadPoolNoAbortBuilder::new_for_indexing()
.build()
.expect("failed to build default rayon thread pool");
(pool, None)
}
impl Default for IndexerConfig {
fn default() -> Self {
let (thread_pool, max_threads) = default_thread_pool_and_threads();
Self {
max_threads,
thread_pool,
log_every_n: None,
max_nb_chunks: None,
documents_chunk_size: None,
max_memory: None,
chunk_compression_type: CompressionType::None,
chunk_compression_level: None,
thread_pool: None,
max_positions_per_attributes: None,
skip_index_budget: false,
}

View File

@@ -5,7 +5,7 @@ pub use self::concurrent_available_ids::ConcurrentAvailableIds;
pub use self::facet::bulk::FacetsUpdateBulk;
pub use self::facet::incremental::FacetsUpdateIncrementalInner;
pub use self::index_documents::*;
pub use self::indexer_config::IndexerConfig;
pub use self::indexer_config::{default_thread_pool_and_threads, IndexerConfig};
pub use self::new::ChannelCongestion;
pub use self::settings::{validate_embedding_settings, Setting, Settings};
pub use self::update_step::UpdateIndexingStep;

View File

@@ -8,7 +8,7 @@ use hashbrown::HashMap;
use serde_json::Value;
use super::super::cache::BalancedCaches;
use super::facet_document::extract_document_facets;
use super::facet_document::{extract_document_facets, extract_geo_document};
use super::FacetKind;
use crate::fields_ids_map::metadata::Metadata;
use crate::filterable_attributes_rules::match_faceted_field;
@@ -90,17 +90,12 @@ impl FacetedDocidsExtractor {
let mut cached_sorter = context.data.borrow_mut_or_yield();
let mut del_add_facet_value = DelAddFacetValue::new(&context.doc_alloc);
let docid = document_change.docid();
let res = match document_change {
DocumentChange::Deletion(inner) => extract_document_facets(
inner.current(rtxn, index, context.db_fields_ids_map)?,
inner.external_document_id(),
new_fields_ids_map.deref_mut(),
filterable_attributes,
sortable_fields,
asc_desc_fields,
distinct_field,
is_geo_enabled,
&mut |fid, meta, depth, value| {
// Using a macro avoid borrowing the parameters as mutable in both closures at
// the same time by postponing their creation
macro_rules! facet_fn {
(del) => {
|fid: FieldId, meta: Metadata, depth: perm_json_p::Depth, value: &Value| {
Self::facet_fn_with_options(
&context.doc_alloc,
cached_sorter.deref_mut(),
@@ -114,91 +109,10 @@ impl FacetedDocidsExtractor {
depth,
value,
)
},
),
DocumentChange::Update(inner) => {
let has_changed = inner.has_changed_for_fields(
&mut |field_name| {
match_faceted_field(
field_name,
filterable_attributes,
sortable_fields,
asc_desc_fields,
distinct_field,
)
},
rtxn,
index,
context.db_fields_ids_map,
)?;
let has_changed_for_geo_fields =
inner.has_changed_for_geo_fields(rtxn, index, context.db_fields_ids_map)?;
if !has_changed && !has_changed_for_geo_fields {
return Ok(());
}
extract_document_facets(
inner.current(rtxn, index, context.db_fields_ids_map)?,
inner.external_document_id(),
new_fields_ids_map.deref_mut(),
filterable_attributes,
sortable_fields,
asc_desc_fields,
distinct_field,
is_geo_enabled,
&mut |fid, meta, depth, value| {
Self::facet_fn_with_options(
&context.doc_alloc,
cached_sorter.deref_mut(),
BalancedCaches::insert_del_u32,
&mut del_add_facet_value,
DelAddFacetValue::insert_del,
docid,
fid,
meta,
filterable_attributes,
depth,
value,
)
},
)?;
extract_document_facets(
inner.merged(rtxn, index, context.db_fields_ids_map)?,
inner.external_document_id(),
new_fields_ids_map.deref_mut(),
filterable_attributes,
sortable_fields,
asc_desc_fields,
distinct_field,
is_geo_enabled,
&mut |fid, meta, depth, value| {
Self::facet_fn_with_options(
&context.doc_alloc,
cached_sorter.deref_mut(),
BalancedCaches::insert_add_u32,
&mut del_add_facet_value,
DelAddFacetValue::insert_add,
docid,
fid,
meta,
filterable_attributes,
depth,
value,
)
},
)
}
DocumentChange::Insertion(inner) => extract_document_facets(
inner.inserted(),
inner.external_document_id(),
new_fields_ids_map.deref_mut(),
filterable_attributes,
sortable_fields,
asc_desc_fields,
distinct_field,
is_geo_enabled,
&mut |fid, meta, depth, value| {
};
(add) => {
|fid: FieldId, meta: Metadata, depth: perm_json_p::Depth, value: &Value| {
Self::facet_fn_with_options(
&context.doc_alloc,
cached_sorter.deref_mut(),
@@ -212,12 +126,116 @@ impl FacetedDocidsExtractor {
depth,
value,
)
},
),
}
};
}
match document_change {
DocumentChange::Deletion(inner) => {
let mut del = facet_fn!(del);
extract_document_facets(
inner.current(rtxn, index, context.db_fields_ids_map)?,
new_fields_ids_map.deref_mut(),
filterable_attributes,
sortable_fields,
asc_desc_fields,
distinct_field,
&mut del,
)?;
if is_geo_enabled {
extract_geo_document(
inner.current(rtxn, index, context.db_fields_ids_map)?,
inner.external_document_id(),
new_fields_ids_map.deref_mut(),
&mut del,
)?;
}
}
DocumentChange::Update(inner) => {
let has_changed_for_facets = inner.has_changed_for_fields(
&mut |field_name| {
match_faceted_field(
field_name,
filterable_attributes,
sortable_fields,
asc_desc_fields,
distinct_field,
)
},
rtxn,
index,
context.db_fields_ids_map,
)?;
// 1. Maybe update doc
if has_changed_for_facets {
extract_document_facets(
inner.current(rtxn, index, context.db_fields_ids_map)?,
new_fields_ids_map.deref_mut(),
filterable_attributes,
sortable_fields,
asc_desc_fields,
distinct_field,
&mut facet_fn!(del),
)?;
extract_document_facets(
inner.merged(rtxn, index, context.db_fields_ids_map)?,
new_fields_ids_map.deref_mut(),
filterable_attributes,
sortable_fields,
asc_desc_fields,
distinct_field,
&mut facet_fn!(add),
)?;
}
// 2. Maybe update geo
if is_geo_enabled
&& inner.has_changed_for_geo_fields(rtxn, index, context.db_fields_ids_map)?
{
extract_geo_document(
inner.current(rtxn, index, context.db_fields_ids_map)?,
inner.external_document_id(),
new_fields_ids_map.deref_mut(),
&mut facet_fn!(del),
)?;
extract_geo_document(
inner.merged(rtxn, index, context.db_fields_ids_map)?,
inner.external_document_id(),
new_fields_ids_map.deref_mut(),
&mut facet_fn!(add),
)?;
}
}
DocumentChange::Insertion(inner) => {
let mut add = facet_fn!(add);
extract_document_facets(
inner.inserted(),
new_fields_ids_map.deref_mut(),
filterable_attributes,
sortable_fields,
asc_desc_fields,
distinct_field,
&mut add,
)?;
if is_geo_enabled {
extract_geo_document(
inner.inserted(),
inner.external_document_id(),
new_fields_ids_map.deref_mut(),
&mut add,
)?;
}
}
};
del_add_facet_value.send_data(docid, sender, &context.doc_alloc).unwrap();
res
Ok(())
}
#[allow(clippy::too_many_arguments)]

View File

@@ -15,13 +15,11 @@ use crate::{
#[allow(clippy::too_many_arguments)]
pub fn extract_document_facets<'doc>(
document: impl Document<'doc>,
external_document_id: &str,
field_id_map: &mut GlobalFieldsIdsMap,
filterable_attributes: &[FilterableAttributesRule],
sortable_fields: &HashSet<String>,
asc_desc_fields: &HashSet<String>,
distinct_field: &Option<String>,
is_geo_enabled: bool,
facet_fn: &mut impl FnMut(FieldId, Metadata, perm_json_p::Depth, &Value) -> Result<()>,
) -> Result<()> {
// return the match result for the given field name.
@@ -101,17 +99,24 @@ pub fn extract_document_facets<'doc>(
}
}
if is_geo_enabled {
if let Some(geo_value) = document.geo_field()? {
if let Some([lat, lng]) = extract_geo_coordinates(external_document_id, geo_value)? {
let ((lat_fid, lat_meta), (lng_fid, lng_meta)) = field_id_map
.id_with_metadata_or_insert("_geo.lat")
.zip(field_id_map.id_with_metadata_or_insert("_geo.lng"))
.ok_or(UserError::AttributeLimitReached)?;
Ok(())
}
facet_fn(lat_fid, lat_meta, perm_json_p::Depth::OnBaseKey, &lat.into())?;
facet_fn(lng_fid, lng_meta, perm_json_p::Depth::OnBaseKey, &lng.into())?;
}
pub fn extract_geo_document<'doc>(
document: impl Document<'doc>,
external_document_id: &str,
field_id_map: &mut GlobalFieldsIdsMap,
facet_fn: &mut impl FnMut(FieldId, Metadata, perm_json_p::Depth, &Value) -> Result<()>,
) -> Result<()> {
if let Some(geo_value) = document.geo_field()? {
if let Some([lat, lng]) = extract_geo_coordinates(external_document_id, geo_value)? {
let ((lat_fid, lat_meta), (lng_fid, lng_meta)) = field_id_map
.id_with_metadata_or_insert("_geo.lat")
.zip(field_id_map.id_with_metadata_or_insert("_geo.lng"))
.ok_or(UserError::AttributeLimitReached)?;
facet_fn(lat_fid, lat_meta, perm_json_p::Depth::OnBaseKey, &lat.into())?;
facet_fn(lng_fid, lng_meta, perm_json_p::Depth::OnBaseKey, &lng.into())?;
}
}

View File

@@ -336,7 +336,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
self.primary_key = Setting::Set(primary_key);
}
pub fn set_autorize_typos(&mut self, val: bool) {
pub fn set_authorize_typos(&mut self, val: bool) {
self.authorize_typos = Setting::Set(val);
}

View File

@@ -792,7 +792,7 @@ fn test_disable_typo() {
index
.update_settings_using_wtxn(&mut txn, |settings| {
settings.set_autorize_typos(false);
settings.set_authorize_typos(false);
})
.unwrap();