diff --git a/Cargo.lock b/Cargo.lock index fa73c52a1..ee17054d7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -467,7 +467,7 @@ dependencies = [ "page_size", "rand 0.8.5", "rayon", - "roaring", + "roaring 0.10.12", "tempfile", "thiserror 2.0.12", "tracing", @@ -604,7 +604,7 @@ dependencies = [ "rand 0.8.5", "rand_chacha 0.3.1", "reqwest", - "roaring", + "roaring 0.10.12", "serde_json", "tempfile", ] @@ -1072,7 +1072,8 @@ dependencies = [ "h3o", "heed", "ordered-float 5.0.0", - "roaring", + "roaring 0.11.1", + "steppe", "thiserror 2.0.12", ] @@ -1808,7 +1809,7 @@ dependencies = [ "meilisearch-types", "once_cell", "regex", - "roaring", + "roaring 0.10.12", "serde", "serde_json", "tar", @@ -3183,7 +3184,7 @@ dependencies = [ "memmap2", "page_size", "rayon", - "roaring", + "roaring 0.10.12", "serde", "serde_json", "synchronoise", @@ -3197,9 +3198,9 @@ dependencies = [ [[package]] name = "indexmap" -version = "2.9.0" +version = "2.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cea70ddb795996207ad57735b50c5982d8844f38ba9ee5f1aedcfb708a2aa11e" +checksum = "fe4cd85333e22411419a0bcae1297d25e58c9443848b11dc6a86fefe8c78a661" dependencies = [ "equivalent", "hashbrown 0.15.4", @@ -3954,7 +3955,7 @@ dependencies = [ "rayon", "regex", "reqwest", - "roaring", + "roaring 0.10.12", "rustls", "rustls-pemfile", "rustls-pki-types", @@ -4001,7 +4002,7 @@ dependencies = [ "maplit", "meilisearch-types", "rand 0.8.5", - "roaring", + "roaring 0.10.12", "serde", "serde_json", "sha2", @@ -4031,7 +4032,7 @@ dependencies = [ "meili-snap", "memmap2", "milli", - "roaring", + "roaring 0.10.12", "rustc-hash 2.1.1", "serde", "serde-cs", @@ -4137,7 +4138,7 @@ dependencies = [ "rand 0.8.5", "rayon", "rhai", - "roaring", + "roaring 0.10.12", "rstar", "rustc-hash 2.1.1", "serde", @@ -4146,6 +4147,7 @@ dependencies = [ "smallstr", "smallvec", "smartstring", + "steppe", "tempfile", "thiserror 2.0.12", "thread_local", @@ -5387,6 +5389,16 @@ dependencies = [ "serde", ] +[[package]] +name = "roaring" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "18f79304aff09c245934bb9558a215e53a4cfbbe6aa8ac2a79847be551264979" +dependencies = [ + "bytemuck", + "byteorder", +] + [[package]] name = "robust" version = "1.2.0" @@ -5987,6 +5999,17 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" +[[package]] +name = "steppe" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c4eff5a5a26ccdbe5122027ce9bd395cf7e7d7e6297a40fa0ba64eaf115db4fb" +dependencies = [ + "convert_case 0.8.0", + "indexmap", + "serde", +] + [[package]] name = "strsim" version = "0.10.0" diff --git a/crates/meilisearch-types/src/error.rs b/crates/meilisearch-types/src/error.rs index cc651950c..d8d2f628a 100644 --- a/crates/meilisearch-types/src/error.rs +++ b/crates/meilisearch-types/src/error.rs @@ -509,6 +509,7 @@ impl ErrorCode for milli::Error { | UserError::DocumentEditionCompilationError(_) => { Code::EditDocumentsByFunctionError } + UserError::CelluliteError(_) => Code::Internal, } } } diff --git a/crates/milli/Cargo.toml b/crates/milli/Cargo.toml index 2a5bb139b..b9b13f574 100644 --- a/crates/milli/Cargo.toml +++ b/crates/milli/Cargo.toml @@ -20,6 +20,8 @@ bytemuck = { version = "1.23.1", features = ["extern_crate_alloc"] } byteorder = "1.5.0" # cellulite = { git = "https://github.com/irevoire/cellulite", branch = "main"} cellulite = { path = "../../../cellulite" } +# steppe = { path = "../../../steppe" } +steppe = "0.3.0" charabia = { version = "0.9.6", default-features = false } concat-arrays = "0.1.2" convert_case = "0.8.0" diff --git a/crates/milli/src/error.rs b/crates/milli/src/error.rs index 6ab1e5b9c..c9c39db18 100644 --- a/crates/milli/src/error.rs +++ b/crates/milli/src/error.rs @@ -99,6 +99,12 @@ pub enum SerializationError { InvalidNumberSerialization, } +impl From for Error { + fn from(error: cellulite::Error) -> Self { + Self::UserError(UserError::CelluliteError(error)) + } +} + #[derive(Error, Debug)] pub enum FieldIdMapMissingEntry { #[error("unknown field id {field_id} coming from the {process} process")] @@ -109,6 +115,8 @@ pub enum FieldIdMapMissingEntry { #[derive(Error, Debug)] pub enum UserError { + #[error(transparent)] + CelluliteError(#[from] cellulite::Error), #[error("A document cannot contain more than 65,535 fields.")] AttributeLimitReached, #[error(transparent)] diff --git a/crates/milli/src/fields_ids_map/metadata.rs b/crates/milli/src/fields_ids_map/metadata.rs index 89b0a446b..266cf209d 100644 --- a/crates/milli/src/fields_ids_map/metadata.rs +++ b/crates/milli/src/fields_ids_map/metadata.rs @@ -6,7 +6,7 @@ use heed::RoTxn; use super::FieldsIdsMap; use crate::attribute_patterns::{match_field_legacy, PatternMatch}; -use crate::constants::{RESERVED_GEO_FIELD_NAME, RESERVED_VECTORS_FIELD_NAME}; +use crate::constants::{RESERVED_GEOJSON_FIELD_NAME, RESERVED_GEO_FIELD_NAME, RESERVED_VECTORS_FIELD_NAME}; use crate::{ is_faceted_by, FieldId, FilterableAttributesFeatures, FilterableAttributesRule, Index, LocalizedAttributesRule, Result, Weight, @@ -24,6 +24,8 @@ pub struct Metadata { pub asc_desc: bool, /// The field is a geo field (`_geo`, `_geo.lat`, `_geo.lng`). pub geo: bool, + /// The field is a geo json field (`_geojson`). + pub geo_json: bool, /// The id of the localized attributes rule if the field is localized. pub localized_attributes_rule_id: Option, /// The id of the filterable attributes rule if the field is filterable. @@ -269,6 +271,7 @@ impl MetadataBuilder { distinct: false, asc_desc: false, geo: false, + geo_json: false, localized_attributes_rule_id: None, filterable_attributes_rule_id: None, }; @@ -295,6 +298,20 @@ impl MetadataBuilder { distinct: false, asc_desc: false, geo: true, + geo_json: false, + localized_attributes_rule_id: None, + filterable_attributes_rule_id, + }; + } + if match_field_legacy(RESERVED_GEOJSON_FIELD_NAME, field) == PatternMatch::Match { + debug_assert!(!sortable, "geojson fields should not be sortable"); + return Metadata { + searchable: None, + sortable, + distinct: false, + asc_desc: false, + geo: false, + geo_json: true, localized_attributes_rule_id: None, filterable_attributes_rule_id, }; @@ -328,6 +345,7 @@ impl MetadataBuilder { distinct, asc_desc, geo: false, + geo_json: false, localized_attributes_rule_id, filterable_attributes_rule_id, } diff --git a/crates/milli/src/progress.rs b/crates/milli/src/progress.rs index 61c61cd49..91682e321 100644 --- a/crates/milli/src/progress.rs +++ b/crates/milli/src/progress.rs @@ -317,3 +317,27 @@ impl Step for arroy::SubStep { self.max } } + +// Integration with steppe + +impl steppe::Progress for Progress { + fn update(&self, sub_progress: impl steppe::Step) { + self.update_progress(Compat(sub_progress)); + } +} + +struct Compat(T); + +impl Step for Compat { + fn name(&self) -> Cow<'static, str> { + self.0.name().into() + } + + fn current(&self) -> u32 { + self.0.current().try_into().unwrap_or(u32::MAX) + } + + fn total(&self) -> u32 { + self.0.total().try_into().unwrap_or(u32::MAX) + } +} \ No newline at end of file diff --git a/crates/milli/src/search/facet/filter.rs b/crates/milli/src/search/facet/filter.rs index b0a24d530..0a794c62d 100644 --- a/crates/milli/src/search/facet/filter.rs +++ b/crates/milli/src/search/facet/filter.rs @@ -794,10 +794,12 @@ impl<'a> Filter<'a> { ), Vec::new(), ); - let cellulite = cellulite::Writer::new(index.cellulite); + let cellulite = cellulite::Cellulite::new(index.cellulite); let result = cellulite .in_shape(rtxn, &polygon.into(), &mut |_| ()) .map_err(InternalError::CelluliteError)?; + // TODO: Remove once we update roaring + let result = roaring::RoaringBitmap::from_iter(result.into_iter()); Ok(result) } else { Err(points[0][0].as_external_error(FilterError::AttributeNotFilterable { diff --git a/crates/milli/src/update/index_documents/extract/extract_geo_points.rs b/crates/milli/src/update/index_documents/extract/extract_geo_points.rs index 44df98a39..d9e79d6aa 100644 --- a/crates/milli/src/update/index_documents/extract/extract_geo_points.rs +++ b/crates/milli/src/update/index_documents/extract/extract_geo_points.rs @@ -10,7 +10,7 @@ use crate::error::GeoError; use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd}; use crate::update::index_documents::extract_finite_float_from_value; use crate::update::settings::{InnerIndexSettings, InnerIndexSettingsDiff}; -use crate::{FieldId, InternalError, Result}; +use crate::{DocumentId, FieldId, InternalError, Result}; /// Extracts the geographical coordinates contained in each document under the `_geo` field. /// @@ -158,7 +158,7 @@ pub fn extract_geojson( obkv.insert(DelAdd::Addition, geojson.to_string().as_bytes())?; } let bytes = obkv.into_inner()?; - writer.insert(docid_bytes, bytes)?; + writer.insert(&docid_bytes[0..std::mem::size_of::()], bytes)?; } } @@ -172,13 +172,15 @@ fn extract_geojson_field( document_id: impl Fn() -> Value, ) -> Result> { match settings.geojson_fid { - Some(fid) => { + Some(fid) if settings.filterable_attributes_rules.iter().any(|rule| rule.has_geojson()) => { let value = obkv.get(fid).map(KvReaderDelAdd::from_slice).and_then(|r| r.get(deladd)); // TODO: That's a user error, not an internal error Ok(value .map(|v| serde_json::from_slice(v).map_err(InternalError::SerdeJson)) .transpose()?) } - None => Ok(None), + _ => { + Ok(None) + } } } diff --git a/crates/milli/src/update/index_documents/extract/mod.rs b/crates/milli/src/update/index_documents/extract/mod.rs index e1c2f254c..1ff9f4b95 100644 --- a/crates/milli/src/update/index_documents/extract/mod.rs +++ b/crates/milli/src/update/index_documents/extract/mod.rs @@ -243,9 +243,9 @@ fn send_original_documents_data( let original_documents_chunk = original_documents_chunk.and_then(|c| unsafe { as_cloneable_grenad(&c) })?; - tracing::info!("Do we have a geojson"); - if settings_diff.run_geojson_indexing() { - tracing::info!("Yes we do"); + tracing::warn!("Do we have a geojson"); + if settings_diff.reindex_geojson() { + tracing::warn!("Yes we do"); let documents_chunk_cloned = original_documents_chunk.clone(); let lmdb_writer_sx_cloned = lmdb_writer_sx.clone(); let settings_diff = settings_diff.clone(); diff --git a/crates/milli/src/update/index_documents/mod.rs b/crates/milli/src/update/index_documents/mod.rs index f8cf79144..5d56dd1f0 100644 --- a/crates/milli/src/update/index_documents/mod.rs +++ b/crates/milli/src/update/index_documents/mod.rs @@ -539,6 +539,10 @@ where .map_err(InternalError::from)??; } + tracing::warn!("Building cellulite"); + let cellulite = cellulite::Cellulite::new(self.index.cellulite); + cellulite.build(self.wtxn, &Progress::default())?; + self.execute_prefix_databases( word_docids.map(MergerBuilder::build), exact_word_docids.map(MergerBuilder::build), diff --git a/crates/milli/src/update/index_documents/transform.rs b/crates/milli/src/update/index_documents/transform.rs index e07483aff..70523a49d 100644 --- a/crates/milli/src/update/index_documents/transform.rs +++ b/crates/milli/src/update/index_documents/transform.rs @@ -820,7 +820,7 @@ impl<'a, 'i> Transform<'a, 'i> { let documents_count = documents_ids.len() as usize; // We initialize the sorter with the user indexing settings. - let mut original_sorter = if settings_diff.reindex_vectors() { + let mut original_sorter = if settings_diff.reindex_vectors() || settings_diff.reindex_geojson() { Some(create_sorter( grenad::SortAlgorithm::Stable, KeepFirst, diff --git a/crates/milli/src/update/index_documents/typed_chunk.rs b/crates/milli/src/update/index_documents/typed_chunk.rs index b3015fa94..ae4df8af9 100644 --- a/crates/milli/src/update/index_documents/typed_chunk.rs +++ b/crates/milli/src/update/index_documents/typed_chunk.rs @@ -629,26 +629,25 @@ pub(crate) fn write_typed_chunk_into_index( } let merger = builder.build(); - let cellulite = index.cellulite; + let cellulite = cellulite::Cellulite::new(index.cellulite); let mut iter = merger.into_stream_merger_iter()?; while let Some((key, value)) = iter.next()? { // convert the key back to a u32 (4 bytes) + tracing::warn!("Key: {:?}, length: {}", key, key.len()); let docid = key.try_into().map(DocumentId::from_be_bytes).unwrap(); let deladd_obkv = KvReaderDelAdd::from_slice(value); if let Some(_value) = deladd_obkv.get(DelAdd::Deletion) { - todo!("handle deletion"); - // cellulite.remove(&docid); + cellulite.delete(wtxn, docid)?; } if let Some(value) = deladd_obkv.get(DelAdd::Addition) { - tracing::info!("Adding one geojson to cellulite"); + tracing::warn!("Adding one geojson to cellulite"); let geojson = geojson::GeoJson::from_reader(value).map_err(UserError::SerdeJson)?; - let writer = cellulite::Writer::new(index.cellulite); - writer - .add_item(wtxn, docid, &geojson) + cellulite + .add(wtxn, docid, &geojson) .map_err(InternalError::CelluliteError)?; } } diff --git a/crates/milli/src/update/new/indexer/mod.rs b/crates/milli/src/update/new/indexer/mod.rs index a6ba3a919..f9459dbc5 100644 --- a/crates/milli/src/update/new/indexer/mod.rs +++ b/crates/milli/src/update/new/indexer/mod.rs @@ -163,6 +163,10 @@ where indexing_context.progress.update_progress(IndexingStep::WritingEmbeddingsToDatabase); + + let cellulite = cellulite::Cellulite::new(index.cellulite); + cellulite.build(wtxn, indexing_context.progress)?; + pool.install(|| { build_vectors( index, diff --git a/crates/milli/src/update/new/indexer/write.rs b/crates/milli/src/update/new/indexer/write.rs index 24e32f0bc..4a5f80688 100644 --- a/crates/milli/src/update/new/indexer/write.rs +++ b/crates/milli/src/update/new/indexer/write.rs @@ -73,8 +73,8 @@ pub fn write_to_db( writer.add_item_in_store(wtxn, docid, extractor_id, embedding)?; } ReceiverAction::GeoJson(docid, geojson) => { - let cellulite = cellulite::Writer::new(index.cellulite); - cellulite.add_item(wtxn, docid, &geojson).map_err(InternalError::CelluliteError)?; + let cellulite = cellulite::Cellulite::new(index.cellulite); + cellulite.add(wtxn, docid, &geojson).map_err(InternalError::CelluliteError)?; } } diff --git a/crates/milli/src/update/settings.rs b/crates/milli/src/update/settings.rs index 9866f7147..0b7be364e 100644 --- a/crates/milli/src/update/settings.rs +++ b/crates/milli/src/update/settings.rs @@ -1767,7 +1767,7 @@ impl InnerIndexSettingsDiff { } pub fn any_reindexing_needed(&self) -> bool { - self.reindex_searchable() || self.reindex_facets() || self.reindex_vectors() + self.reindex_searchable() || self.reindex_facets() || self.reindex_vectors() || self.reindex_geojson() } pub fn reindex_searchable(&self) -> bool { @@ -1876,6 +1876,11 @@ impl InnerIndexSettingsDiff { !self.embedding_config_updates.is_empty() } + pub fn reindex_geojson(&self) -> bool { + self.old.filterable_attributes_rules.iter().any(|rule| rule.has_geojson()) + != self.new.filterable_attributes_rules.iter().any(|rule| rule.has_geojson()) + } + pub fn settings_update_only(&self) -> bool { self.settings_update_only } @@ -1886,8 +1891,6 @@ impl InnerIndexSettingsDiff { } pub fn run_geojson_indexing(&self) -> bool { - tracing::info!("old.geojson_fid: {:?}", self.old.geojson_fid); - tracing::info!("new.geojson_fid: {:?}", self.new.geojson_fid); self.old.geojson_fid != self.new.geojson_fid || (!self.settings_update_only && self.new.geojson_fid.is_some()) }