diff --git a/Cargo.lock b/Cargo.lock index ee17054d7..fe0c71813 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1075,6 +1075,7 @@ dependencies = [ "roaring 0.11.1", "steppe", "thiserror 2.0.12", + "zerometry", ] [[package]] @@ -1236,6 +1237,17 @@ version = "1.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5b63caa9aa9397e2d9480a9b13673856c78d8ac123288526c37d7839f2a86990" +[[package]] +name = "colored_json" +version = "5.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e35980a1b846f8e3e359fd18099172a0857140ba9230affc4f71348081e039b6" +dependencies = [ + "serde", + "serde_json", + "yansi", +] + [[package]] name = "concat-arrays" version = "0.1.2" @@ -3372,6 +3384,47 @@ dependencies = [ "regex", ] +[[package]] +name = "jiff" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be1f93b8b1eb69c77f24bbb0afdf66f54b632ee39af40ca21c4365a1d7347e49" +dependencies = [ + "jiff-static", + "jiff-tzdb-platform", + "log", + "portable-atomic", + "portable-atomic-util", + "serde", + "windows-sys 0.52.0", +] + +[[package]] +name = "jiff-static" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "03343451ff899767262ec32146f6d559dd759fdadf42ff0e227c7c48f72594b4" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.101", +] + +[[package]] +name = "jiff-tzdb" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c1283705eb0a21404d2bfd6eef2a7593d240bc42a0bdb39db0ad6fa2ec026524" + +[[package]] +name = "jiff-tzdb-platform" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "875a5a69ac2bab1a891711cf5eccbec1ce0341ea805560dcd90b7a2e925132e8" +dependencies = [ + "jiff-tzdb", +] + [[package]] name = "jobserver" version = "0.1.33" @@ -4828,6 +4881,15 @@ version = "1.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "350e9b48cbc6b0e028b0473b114454c6316e57336ee184ceab6e53f72c178b3e" +[[package]] +name = "portable-atomic-util" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d8a2f0d8d040d7848a709caf78912debcc3f33ee4b3cac47d73d1e1069e83507" +dependencies = [ + "portable-atomic", +] + [[package]] name = "potential_utf" version = "0.1.2" @@ -6001,13 +6063,14 @@ checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" [[package]] name = "steppe" -version = "0.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c4eff5a5a26ccdbe5122027ce9bd395cf7e7d7e6297a40fa0ba64eaf115db4fb" +version = "0.4.0" dependencies = [ + "colored_json", "convert_case 0.8.0", "indexmap", + "jiff", "serde", + "serde_json", ] [[package]] @@ -7492,6 +7555,12 @@ version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "aed111bd9e48a802518765906cbdadf0b45afb72b9c81ab049a3b86252adffdd" +[[package]] +name = "yansi" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cfe53a6657fd280eaa890a3bc59152892ffa3e30101319d168b781ed6529b049" + [[package]] name = "yaup" version = "0.3.1" @@ -7612,6 +7681,16 @@ dependencies = [ "syn 2.0.101", ] +[[package]] +name = "zerometry" +version = "0.1.0" +dependencies = [ + "bytemuck", + "byteorder", + "geo", + "geo-types", +] + [[package]] name = "zerotrie" version = "0.2.2" diff --git a/crates/milli/Cargo.toml b/crates/milli/Cargo.toml index b9b13f574..849d5a2c9 100644 --- a/crates/milli/Cargo.toml +++ b/crates/milli/Cargo.toml @@ -20,8 +20,8 @@ bytemuck = { version = "1.23.1", features = ["extern_crate_alloc"] } byteorder = "1.5.0" # cellulite = { git = "https://github.com/irevoire/cellulite", branch = "main"} cellulite = { path = "../../../cellulite" } -# steppe = { path = "../../../steppe" } -steppe = "0.3.0" +steppe = { path = "../../../steppe" } +# steppe = "0.3.0" charabia = { version = "0.9.6", default-features = false } concat-arrays = "0.1.2" convert_case = "0.8.0" diff --git a/crates/milli/src/index.rs b/crates/milli/src/index.rs index ee47f458a..7bd8f9f98 100644 --- a/crates/milli/src/index.rs +++ b/crates/milli/src/index.rs @@ -184,7 +184,7 @@ pub struct Index { pub vector_arroy: arroy::Database, /// Geo store based on celluliteā„¢. - pub cellulite: cellulite::Database, + pub cellulite: cellulite::Cellulite, /// Maps the document id to the document as an obkv store. pub(crate) documents: Database, @@ -242,7 +242,7 @@ impl Index { let embedder_category_id = env.create_database(&mut wtxn, Some(VECTOR_EMBEDDER_CATEGORY_ID))?; let vector_arroy = env.create_database(&mut wtxn, Some(VECTOR_ARROY))?; - let cellulite = env.create_database(&mut wtxn, Some(CELLULITE))?; + let cellulite = cellulite::Cellulite::create_from_env(&env, &mut wtxn)?; let documents = env.create_database(&mut wtxn, Some(DOCUMENTS))?; diff --git a/crates/milli/src/search/facet/filter.rs b/crates/milli/src/search/facet/filter.rs index 0a794c62d..374db87a5 100644 --- a/crates/milli/src/search/facet/filter.rs +++ b/crates/milli/src/search/facet/filter.rs @@ -794,8 +794,7 @@ impl<'a> Filter<'a> { ), Vec::new(), ); - let cellulite = cellulite::Cellulite::new(index.cellulite); - let result = cellulite + let result = index.cellulite .in_shape(rtxn, &polygon.into(), &mut |_| ()) .map_err(InternalError::CelluliteError)?; // TODO: Remove once we update roaring diff --git a/crates/milli/src/update/index_documents/mod.rs b/crates/milli/src/update/index_documents/mod.rs index 5d56dd1f0..c4ff190e1 100644 --- a/crates/milli/src/update/index_documents/mod.rs +++ b/crates/milli/src/update/index_documents/mod.rs @@ -540,8 +540,7 @@ where } tracing::warn!("Building cellulite"); - let cellulite = cellulite::Cellulite::new(self.index.cellulite); - cellulite.build(self.wtxn, &Progress::default())?; + self.index.cellulite.build(self.wtxn, &Progress::default())?; self.execute_prefix_databases( word_docids.map(MergerBuilder::build), diff --git a/crates/milli/src/update/index_documents/typed_chunk.rs b/crates/milli/src/update/index_documents/typed_chunk.rs index ae4df8af9..caa2de23a 100644 --- a/crates/milli/src/update/index_documents/typed_chunk.rs +++ b/crates/milli/src/update/index_documents/typed_chunk.rs @@ -629,8 +629,6 @@ pub(crate) fn write_typed_chunk_into_index( } let merger = builder.build(); - let cellulite = cellulite::Cellulite::new(index.cellulite); - let mut iter = merger.into_stream_merger_iter()?; while let Some((key, value)) = iter.next()? { // convert the key back to a u32 (4 bytes) @@ -639,14 +637,14 @@ pub(crate) fn write_typed_chunk_into_index( let deladd_obkv = KvReaderDelAdd::from_slice(value); if let Some(_value) = deladd_obkv.get(DelAdd::Deletion) { - cellulite.delete(wtxn, docid)?; + index.cellulite.delete(wtxn, docid)?; } if let Some(value) = deladd_obkv.get(DelAdd::Addition) { tracing::warn!("Adding one geojson to cellulite"); let geojson = geojson::GeoJson::from_reader(value).map_err(UserError::SerdeJson)?; - cellulite + index.cellulite .add(wtxn, docid, &geojson) .map_err(InternalError::CelluliteError)?; } diff --git a/crates/milli/src/update/new/channel.rs b/crates/milli/src/update/new/channel.rs index ba36faf1c..e1707fd93 100644 --- a/crates/milli/src/update/new/channel.rs +++ b/crates/milli/src/update/new/channel.rs @@ -469,7 +469,6 @@ pub enum Database { FieldIdDocidFacetStrings, FieldIdDocidFacetF64s, VectorEmbedderCategoryId, - Cellulite, } impl Database { @@ -492,7 +491,6 @@ impl Database { Database::FieldIdDocidFacetStrings => index.field_id_docid_facet_strings.remap_types(), Database::FieldIdDocidFacetF64s => index.field_id_docid_facet_f64s.remap_types(), Database::VectorEmbedderCategoryId => index.embedder_category_id.remap_types(), - Database::Cellulite => index.cellulite.remap_types(), } } @@ -515,7 +513,6 @@ impl Database { Database::FieldIdDocidFacetStrings => db_name::FIELD_ID_DOCID_FACET_STRINGS, Database::FieldIdDocidFacetF64s => db_name::FIELD_ID_DOCID_FACET_F64S, Database::VectorEmbedderCategoryId => db_name::VECTOR_EMBEDDER_CATEGORY_ID, - Database::Cellulite => db_name::CELLULITE, } } } diff --git a/crates/milli/src/update/new/extract/geo/cellulite.rs b/crates/milli/src/update/new/extract/geo/cellulite.rs index 80e6a2884..656fd34c0 100644 --- a/crates/milli/src/update/new/extract/geo/cellulite.rs +++ b/crates/milli/src/update/new/extract/geo/cellulite.rs @@ -1,24 +1,21 @@ use std::cell::RefCell; use std::fs::File; -use std::io::{self, BufReader, BufWriter, ErrorKind, Read, Seek as _, Write as _}; +use std::io::{self, BufReader, BufWriter, ErrorKind, Seek as _, Write as _}; use std::str::FromStr; -use std::{iter, mem, result}; +use std::{iter, mem}; use bumpalo::Bump; use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt}; use geojson::GeoJson; use heed::RoTxn; -use serde_json::value::RawValue; -use serde_json::Value; -use crate::error::GeoError; use crate::update::new::document::{Document, DocumentContext}; use crate::update::new::indexer::document_changes::Extractor; use crate::update::new::ref_cell_ext::RefCellExt as _; use crate::update::new::thread_local::MostlySend; use crate::update::new::DocumentChange; use crate::update::GrenadParameters; -use crate::{lat_lng_to_xyz, DocumentId, GeoPoint, Index, InternalError, Result, UserError}; +use crate::{DocumentId, Index, Result, UserError}; pub struct GeoJsonExtractor { grenad_parameters: GrenadParameters, @@ -38,25 +35,6 @@ impl GeoJsonExtractor { } } -/* -#[derive(Pod, Zeroable, Copy, Clone)] -#[repr(C, packed)] -pub struct ExtractedGeoPoint { - pub docid: DocumentId, - pub lat_lng: [f64; 2], -} - -impl From for GeoPoint { - /// Converts the latitude and longitude back to an xyz GeoPoint. - fn from(value: ExtractedGeoPoint) -> Self { - let [lat, lng] = value.lat_lng; - let point = [lat, lng]; - let xyz_point = lat_lng_to_xyz(&point); - GeoPoint::new(xyz_point, (value.docid, point)) - } -} -*/ - pub struct GeoJsonExtractorData<'extractor> { /// The set of documents ids that were removed. If a document sees its geo /// point being updated, we first put it in the deleted and then in the inserted. @@ -265,94 +243,3 @@ impl<'extractor> Extractor<'extractor> for GeoJsonExtractor { Ok(()) } } - -/// Extracts and validates the latitude and latitude from a document geo field. -/// -/// It can be of the form `{ "lat": 0.0, "lng": "1.0" }`. -pub fn extract_geo_coordinates( - external_id: &str, - raw_value: &RawValue, -) -> Result> { - let mut geo = match serde_json::from_str(raw_value.get()).map_err(InternalError::SerdeJson)? { - Value::Null => return Ok(None), - Value::Object(map) => map, - value => { - return Err(Box::new(GeoError::NotAnObject { - document_id: Value::from(external_id), - value, - }) - .into()) - } - }; - - let [lat, lng] = match (geo.remove("lat"), geo.remove("lng")) { - (Some(lat), Some(lng)) => { - if geo.is_empty() { - [lat, lng] - } else { - return Err(Box::new(GeoError::UnexpectedExtraFields { - document_id: Value::from(external_id), - value: Value::from(geo), - }) - .into()); - } - } - (Some(_), None) => { - return Err(Box::new(GeoError::MissingLongitude { - document_id: Value::from(external_id), - }) - .into()) - } - (None, Some(_)) => { - return Err(Box::new(GeoError::MissingLatitude { - document_id: Value::from(external_id), - }) - .into()) - } - (None, None) => { - return Err(Box::new(GeoError::MissingLatitudeAndLongitude { - document_id: Value::from(external_id), - }) - .into()) - } - }; - - match (extract_finite_float_from_value(lat), extract_finite_float_from_value(lng)) { - (Ok(lat), Ok(lng)) => Ok(Some([lat, lng])), - (Ok(_), Err(value)) => { - Err(Box::new(GeoError::BadLongitude { document_id: Value::from(external_id), value }) - .into()) - } - (Err(value), Ok(_)) => { - Err(Box::new(GeoError::BadLatitude { document_id: Value::from(external_id), value }) - .into()) - } - (Err(lat), Err(lng)) => Err(Box::new(GeoError::BadLatitudeAndLongitude { - document_id: Value::from(external_id), - lat, - lng, - }) - .into()), - } -} - -/// Extracts and validate that a serde JSON Value is actually a finite f64. -pub fn extract_finite_float_from_value(value: Value) -> result::Result { - let number = match value { - Value::Number(ref n) => match n.as_f64() { - Some(number) => number, - None => return Err(value), - }, - Value::String(ref s) => match s.parse::() { - Ok(number) => number, - Err(_) => return Err(value), - }, - value => return Err(value), - }; - - if number.is_finite() { - Ok(number) - } else { - Err(value) - } -} diff --git a/crates/milli/src/update/new/indexer/mod.rs b/crates/milli/src/update/new/indexer/mod.rs index f9459dbc5..a2396782f 100644 --- a/crates/milli/src/update/new/indexer/mod.rs +++ b/crates/milli/src/update/new/indexer/mod.rs @@ -163,9 +163,7 @@ where indexing_context.progress.update_progress(IndexingStep::WritingEmbeddingsToDatabase); - - let cellulite = cellulite::Cellulite::new(index.cellulite); - cellulite.build(wtxn, indexing_context.progress)?; + index.cellulite.build(wtxn, indexing_context.progress)?; pool.install(|| { build_vectors( diff --git a/crates/milli/src/update/new/indexer/write.rs b/crates/milli/src/update/new/indexer/write.rs index 6b5497f77..9dfbc90da 100644 --- a/crates/milli/src/update/new/indexer/write.rs +++ b/crates/milli/src/update/new/indexer/write.rs @@ -32,7 +32,6 @@ pub fn write_to_db( let _entered = span.enter(); let span = tracing::trace_span!(target: "indexing::write_db", "post_merge"); let mut _entered_post_merge = None; - let cellulite = cellulite::Cellulite::new(index.cellulite); while let Some(action) = writer_receiver.recv_action() { if _entered_post_merge.is_none() && finished_extraction.load(std::sync::atomic::Ordering::Relaxed) @@ -76,10 +75,10 @@ pub fn write_to_db( ReceiverAction::GeoJson(docid, geojson) => { match geojson { Some(geojson) => { - cellulite.add(wtxn, docid, &geojson).map_err(InternalError::CelluliteError)?; + index.cellulite.add(wtxn, docid, &geojson).map_err(InternalError::CelluliteError)?; } None => { - cellulite.delete(wtxn, docid).map_err(InternalError::CelluliteError)?; + index.cellulite.delete(wtxn, docid).map_err(InternalError::CelluliteError)?; } } } diff --git a/crates/milli/src/update/new/merger.rs b/crates/milli/src/update/new/merger.rs index 670675c6b..811d58673 100644 --- a/crates/milli/src/update/new/merger.rs +++ b/crates/milli/src/update/new/merger.rs @@ -82,6 +82,7 @@ where let mut frozen = data.into_inner().freeze()?; for result in frozen.iter_and_clear_removed()? { let extracted_geo_point = result.map_err(InternalError::SerdeJson)?; + /// Fix that todo!("We must send the docid instead of the geojson"); /* let removed = cellulite.remove(&GeoJsonPoint::from(extracted_geo_point));