Update to the latest versions of cellulite and steppe

This commit is contained in:
Tamo
2025-07-22 16:24:49 +02:00
parent 44dc64accb
commit e923154c90
11 changed files with 97 additions and 140 deletions

85
Cargo.lock generated
View File

@ -1075,6 +1075,7 @@ dependencies = [
"roaring 0.11.1", "roaring 0.11.1",
"steppe", "steppe",
"thiserror 2.0.12", "thiserror 2.0.12",
"zerometry",
] ]
[[package]] [[package]]
@ -1236,6 +1237,17 @@ version = "1.0.3"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5b63caa9aa9397e2d9480a9b13673856c78d8ac123288526c37d7839f2a86990" checksum = "5b63caa9aa9397e2d9480a9b13673856c78d8ac123288526c37d7839f2a86990"
[[package]]
name = "colored_json"
version = "5.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e35980a1b846f8e3e359fd18099172a0857140ba9230affc4f71348081e039b6"
dependencies = [
"serde",
"serde_json",
"yansi",
]
[[package]] [[package]]
name = "concat-arrays" name = "concat-arrays"
version = "0.1.2" version = "0.1.2"
@ -3372,6 +3384,47 @@ dependencies = [
"regex", "regex",
] ]
[[package]]
name = "jiff"
version = "0.2.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "be1f93b8b1eb69c77f24bbb0afdf66f54b632ee39af40ca21c4365a1d7347e49"
dependencies = [
"jiff-static",
"jiff-tzdb-platform",
"log",
"portable-atomic",
"portable-atomic-util",
"serde",
"windows-sys 0.52.0",
]
[[package]]
name = "jiff-static"
version = "0.2.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "03343451ff899767262ec32146f6d559dd759fdadf42ff0e227c7c48f72594b4"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.101",
]
[[package]]
name = "jiff-tzdb"
version = "0.1.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c1283705eb0a21404d2bfd6eef2a7593d240bc42a0bdb39db0ad6fa2ec026524"
[[package]]
name = "jiff-tzdb-platform"
version = "0.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "875a5a69ac2bab1a891711cf5eccbec1ce0341ea805560dcd90b7a2e925132e8"
dependencies = [
"jiff-tzdb",
]
[[package]] [[package]]
name = "jobserver" name = "jobserver"
version = "0.1.33" version = "0.1.33"
@ -4828,6 +4881,15 @@ version = "1.11.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "350e9b48cbc6b0e028b0473b114454c6316e57336ee184ceab6e53f72c178b3e" checksum = "350e9b48cbc6b0e028b0473b114454c6316e57336ee184ceab6e53f72c178b3e"
[[package]]
name = "portable-atomic-util"
version = "0.2.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d8a2f0d8d040d7848a709caf78912debcc3f33ee4b3cac47d73d1e1069e83507"
dependencies = [
"portable-atomic",
]
[[package]] [[package]]
name = "potential_utf" name = "potential_utf"
version = "0.1.2" version = "0.1.2"
@ -6001,13 +6063,14 @@ checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f"
[[package]] [[package]]
name = "steppe" name = "steppe"
version = "0.3.0" version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c4eff5a5a26ccdbe5122027ce9bd395cf7e7d7e6297a40fa0ba64eaf115db4fb"
dependencies = [ dependencies = [
"colored_json",
"convert_case 0.8.0", "convert_case 0.8.0",
"indexmap", "indexmap",
"jiff",
"serde", "serde",
"serde_json",
] ]
[[package]] [[package]]
@ -7492,6 +7555,12 @@ version = "0.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "aed111bd9e48a802518765906cbdadf0b45afb72b9c81ab049a3b86252adffdd" checksum = "aed111bd9e48a802518765906cbdadf0b45afb72b9c81ab049a3b86252adffdd"
[[package]]
name = "yansi"
version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cfe53a6657fd280eaa890a3bc59152892ffa3e30101319d168b781ed6529b049"
[[package]] [[package]]
name = "yaup" name = "yaup"
version = "0.3.1" version = "0.3.1"
@ -7612,6 +7681,16 @@ dependencies = [
"syn 2.0.101", "syn 2.0.101",
] ]
[[package]]
name = "zerometry"
version = "0.1.0"
dependencies = [
"bytemuck",
"byteorder",
"geo",
"geo-types",
]
[[package]] [[package]]
name = "zerotrie" name = "zerotrie"
version = "0.2.2" version = "0.2.2"

View File

@ -20,8 +20,8 @@ bytemuck = { version = "1.23.1", features = ["extern_crate_alloc"] }
byteorder = "1.5.0" byteorder = "1.5.0"
# cellulite = { git = "https://github.com/irevoire/cellulite", branch = "main"} # cellulite = { git = "https://github.com/irevoire/cellulite", branch = "main"}
cellulite = { path = "../../../cellulite" } cellulite = { path = "../../../cellulite" }
# steppe = { path = "../../../steppe" } steppe = { path = "../../../steppe" }
steppe = "0.3.0" # steppe = "0.3.0"
charabia = { version = "0.9.6", default-features = false } charabia = { version = "0.9.6", default-features = false }
concat-arrays = "0.1.2" concat-arrays = "0.1.2"
convert_case = "0.8.0" convert_case = "0.8.0"

View File

@ -184,7 +184,7 @@ pub struct Index {
pub vector_arroy: arroy::Database<Unspecified>, pub vector_arroy: arroy::Database<Unspecified>,
/// Geo store based on cellulite™. /// Geo store based on cellulite™.
pub cellulite: cellulite::Database, pub cellulite: cellulite::Cellulite,
/// Maps the document id to the document as an obkv store. /// Maps the document id to the document as an obkv store.
pub(crate) documents: Database<BEU32, ObkvCodec>, pub(crate) documents: Database<BEU32, ObkvCodec>,
@ -242,7 +242,7 @@ impl Index {
let embedder_category_id = let embedder_category_id =
env.create_database(&mut wtxn, Some(VECTOR_EMBEDDER_CATEGORY_ID))?; env.create_database(&mut wtxn, Some(VECTOR_EMBEDDER_CATEGORY_ID))?;
let vector_arroy = env.create_database(&mut wtxn, Some(VECTOR_ARROY))?; let vector_arroy = env.create_database(&mut wtxn, Some(VECTOR_ARROY))?;
let cellulite = env.create_database(&mut wtxn, Some(CELLULITE))?; let cellulite = cellulite::Cellulite::create_from_env(&env, &mut wtxn)?;
let documents = env.create_database(&mut wtxn, Some(DOCUMENTS))?; let documents = env.create_database(&mut wtxn, Some(DOCUMENTS))?;

View File

@ -794,8 +794,7 @@ impl<'a> Filter<'a> {
), ),
Vec::new(), Vec::new(),
); );
let cellulite = cellulite::Cellulite::new(index.cellulite); let result = index.cellulite
let result = cellulite
.in_shape(rtxn, &polygon.into(), &mut |_| ()) .in_shape(rtxn, &polygon.into(), &mut |_| ())
.map_err(InternalError::CelluliteError)?; .map_err(InternalError::CelluliteError)?;
// TODO: Remove once we update roaring // TODO: Remove once we update roaring

View File

@ -540,8 +540,7 @@ where
} }
tracing::warn!("Building cellulite"); tracing::warn!("Building cellulite");
let cellulite = cellulite::Cellulite::new(self.index.cellulite); self.index.cellulite.build(self.wtxn, &Progress::default())?;
cellulite.build(self.wtxn, &Progress::default())?;
self.execute_prefix_databases( self.execute_prefix_databases(
word_docids.map(MergerBuilder::build), word_docids.map(MergerBuilder::build),

View File

@ -629,8 +629,6 @@ pub(crate) fn write_typed_chunk_into_index(
} }
let merger = builder.build(); let merger = builder.build();
let cellulite = cellulite::Cellulite::new(index.cellulite);
let mut iter = merger.into_stream_merger_iter()?; let mut iter = merger.into_stream_merger_iter()?;
while let Some((key, value)) = iter.next()? { while let Some((key, value)) = iter.next()? {
// convert the key back to a u32 (4 bytes) // convert the key back to a u32 (4 bytes)
@ -639,14 +637,14 @@ pub(crate) fn write_typed_chunk_into_index(
let deladd_obkv = KvReaderDelAdd::from_slice(value); let deladd_obkv = KvReaderDelAdd::from_slice(value);
if let Some(_value) = deladd_obkv.get(DelAdd::Deletion) { if let Some(_value) = deladd_obkv.get(DelAdd::Deletion) {
cellulite.delete(wtxn, docid)?; index.cellulite.delete(wtxn, docid)?;
} }
if let Some(value) = deladd_obkv.get(DelAdd::Addition) { if let Some(value) = deladd_obkv.get(DelAdd::Addition) {
tracing::warn!("Adding one geojson to cellulite"); tracing::warn!("Adding one geojson to cellulite");
let geojson = let geojson =
geojson::GeoJson::from_reader(value).map_err(UserError::SerdeJson)?; geojson::GeoJson::from_reader(value).map_err(UserError::SerdeJson)?;
cellulite index.cellulite
.add(wtxn, docid, &geojson) .add(wtxn, docid, &geojson)
.map_err(InternalError::CelluliteError)?; .map_err(InternalError::CelluliteError)?;
} }

View File

@ -469,7 +469,6 @@ pub enum Database {
FieldIdDocidFacetStrings, FieldIdDocidFacetStrings,
FieldIdDocidFacetF64s, FieldIdDocidFacetF64s,
VectorEmbedderCategoryId, VectorEmbedderCategoryId,
Cellulite,
} }
impl Database { impl Database {
@ -492,7 +491,6 @@ impl Database {
Database::FieldIdDocidFacetStrings => index.field_id_docid_facet_strings.remap_types(), Database::FieldIdDocidFacetStrings => index.field_id_docid_facet_strings.remap_types(),
Database::FieldIdDocidFacetF64s => index.field_id_docid_facet_f64s.remap_types(), Database::FieldIdDocidFacetF64s => index.field_id_docid_facet_f64s.remap_types(),
Database::VectorEmbedderCategoryId => index.embedder_category_id.remap_types(), Database::VectorEmbedderCategoryId => index.embedder_category_id.remap_types(),
Database::Cellulite => index.cellulite.remap_types(),
} }
} }
@ -515,7 +513,6 @@ impl Database {
Database::FieldIdDocidFacetStrings => db_name::FIELD_ID_DOCID_FACET_STRINGS, Database::FieldIdDocidFacetStrings => db_name::FIELD_ID_DOCID_FACET_STRINGS,
Database::FieldIdDocidFacetF64s => db_name::FIELD_ID_DOCID_FACET_F64S, Database::FieldIdDocidFacetF64s => db_name::FIELD_ID_DOCID_FACET_F64S,
Database::VectorEmbedderCategoryId => db_name::VECTOR_EMBEDDER_CATEGORY_ID, Database::VectorEmbedderCategoryId => db_name::VECTOR_EMBEDDER_CATEGORY_ID,
Database::Cellulite => db_name::CELLULITE,
} }
} }
} }

View File

@ -1,24 +1,21 @@
use std::cell::RefCell; use std::cell::RefCell;
use std::fs::File; use std::fs::File;
use std::io::{self, BufReader, BufWriter, ErrorKind, Read, Seek as _, Write as _}; use std::io::{self, BufReader, BufWriter, ErrorKind, Seek as _, Write as _};
use std::str::FromStr; use std::str::FromStr;
use std::{iter, mem, result}; use std::{iter, mem};
use bumpalo::Bump; use bumpalo::Bump;
use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt}; use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt};
use geojson::GeoJson; use geojson::GeoJson;
use heed::RoTxn; use heed::RoTxn;
use serde_json::value::RawValue;
use serde_json::Value;
use crate::error::GeoError;
use crate::update::new::document::{Document, DocumentContext}; use crate::update::new::document::{Document, DocumentContext};
use crate::update::new::indexer::document_changes::Extractor; use crate::update::new::indexer::document_changes::Extractor;
use crate::update::new::ref_cell_ext::RefCellExt as _; use crate::update::new::ref_cell_ext::RefCellExt as _;
use crate::update::new::thread_local::MostlySend; use crate::update::new::thread_local::MostlySend;
use crate::update::new::DocumentChange; use crate::update::new::DocumentChange;
use crate::update::GrenadParameters; use crate::update::GrenadParameters;
use crate::{lat_lng_to_xyz, DocumentId, GeoPoint, Index, InternalError, Result, UserError}; use crate::{DocumentId, Index, Result, UserError};
pub struct GeoJsonExtractor { pub struct GeoJsonExtractor {
grenad_parameters: GrenadParameters, grenad_parameters: GrenadParameters,
@ -38,25 +35,6 @@ impl GeoJsonExtractor {
} }
} }
/*
#[derive(Pod, Zeroable, Copy, Clone)]
#[repr(C, packed)]
pub struct ExtractedGeoPoint {
pub docid: DocumentId,
pub lat_lng: [f64; 2],
}
impl From<ExtractedGeoPoint> for GeoPoint {
/// Converts the latitude and longitude back to an xyz GeoPoint.
fn from(value: ExtractedGeoPoint) -> Self {
let [lat, lng] = value.lat_lng;
let point = [lat, lng];
let xyz_point = lat_lng_to_xyz(&point);
GeoPoint::new(xyz_point, (value.docid, point))
}
}
*/
pub struct GeoJsonExtractorData<'extractor> { pub struct GeoJsonExtractorData<'extractor> {
/// The set of documents ids that were removed. If a document sees its geo /// The set of documents ids that were removed. If a document sees its geo
/// point being updated, we first put it in the deleted and then in the inserted. /// point being updated, we first put it in the deleted and then in the inserted.
@ -265,94 +243,3 @@ impl<'extractor> Extractor<'extractor> for GeoJsonExtractor {
Ok(()) Ok(())
} }
} }
/// Extracts and validates the latitude and latitude from a document geo field.
///
/// It can be of the form `{ "lat": 0.0, "lng": "1.0" }`.
pub fn extract_geo_coordinates(
external_id: &str,
raw_value: &RawValue,
) -> Result<Option<[f64; 2]>> {
let mut geo = match serde_json::from_str(raw_value.get()).map_err(InternalError::SerdeJson)? {
Value::Null => return Ok(None),
Value::Object(map) => map,
value => {
return Err(Box::new(GeoError::NotAnObject {
document_id: Value::from(external_id),
value,
})
.into())
}
};
let [lat, lng] = match (geo.remove("lat"), geo.remove("lng")) {
(Some(lat), Some(lng)) => {
if geo.is_empty() {
[lat, lng]
} else {
return Err(Box::new(GeoError::UnexpectedExtraFields {
document_id: Value::from(external_id),
value: Value::from(geo),
})
.into());
}
}
(Some(_), None) => {
return Err(Box::new(GeoError::MissingLongitude {
document_id: Value::from(external_id),
})
.into())
}
(None, Some(_)) => {
return Err(Box::new(GeoError::MissingLatitude {
document_id: Value::from(external_id),
})
.into())
}
(None, None) => {
return Err(Box::new(GeoError::MissingLatitudeAndLongitude {
document_id: Value::from(external_id),
})
.into())
}
};
match (extract_finite_float_from_value(lat), extract_finite_float_from_value(lng)) {
(Ok(lat), Ok(lng)) => Ok(Some([lat, lng])),
(Ok(_), Err(value)) => {
Err(Box::new(GeoError::BadLongitude { document_id: Value::from(external_id), value })
.into())
}
(Err(value), Ok(_)) => {
Err(Box::new(GeoError::BadLatitude { document_id: Value::from(external_id), value })
.into())
}
(Err(lat), Err(lng)) => Err(Box::new(GeoError::BadLatitudeAndLongitude {
document_id: Value::from(external_id),
lat,
lng,
})
.into()),
}
}
/// Extracts and validate that a serde JSON Value is actually a finite f64.
pub fn extract_finite_float_from_value(value: Value) -> result::Result<f64, Value> {
let number = match value {
Value::Number(ref n) => match n.as_f64() {
Some(number) => number,
None => return Err(value),
},
Value::String(ref s) => match s.parse::<f64>() {
Ok(number) => number,
Err(_) => return Err(value),
},
value => return Err(value),
};
if number.is_finite() {
Ok(number)
} else {
Err(value)
}
}

View File

@ -163,9 +163,7 @@ where
indexing_context.progress.update_progress(IndexingStep::WritingEmbeddingsToDatabase); indexing_context.progress.update_progress(IndexingStep::WritingEmbeddingsToDatabase);
index.cellulite.build(wtxn, indexing_context.progress)?;
let cellulite = cellulite::Cellulite::new(index.cellulite);
cellulite.build(wtxn, indexing_context.progress)?;
pool.install(|| { pool.install(|| {
build_vectors( build_vectors(

View File

@ -32,7 +32,6 @@ pub fn write_to_db(
let _entered = span.enter(); let _entered = span.enter();
let span = tracing::trace_span!(target: "indexing::write_db", "post_merge"); let span = tracing::trace_span!(target: "indexing::write_db", "post_merge");
let mut _entered_post_merge = None; let mut _entered_post_merge = None;
let cellulite = cellulite::Cellulite::new(index.cellulite);
while let Some(action) = writer_receiver.recv_action() { while let Some(action) = writer_receiver.recv_action() {
if _entered_post_merge.is_none() if _entered_post_merge.is_none()
&& finished_extraction.load(std::sync::atomic::Ordering::Relaxed) && finished_extraction.load(std::sync::atomic::Ordering::Relaxed)
@ -76,10 +75,10 @@ pub fn write_to_db(
ReceiverAction::GeoJson(docid, geojson) => { ReceiverAction::GeoJson(docid, geojson) => {
match geojson { match geojson {
Some(geojson) => { Some(geojson) => {
cellulite.add(wtxn, docid, &geojson).map_err(InternalError::CelluliteError)?; index.cellulite.add(wtxn, docid, &geojson).map_err(InternalError::CelluliteError)?;
} }
None => { None => {
cellulite.delete(wtxn, docid).map_err(InternalError::CelluliteError)?; index.cellulite.delete(wtxn, docid).map_err(InternalError::CelluliteError)?;
} }
} }
} }

View File

@ -82,6 +82,7 @@ where
let mut frozen = data.into_inner().freeze()?; let mut frozen = data.into_inner().freeze()?;
for result in frozen.iter_and_clear_removed()? { for result in frozen.iter_and_clear_removed()? {
let extracted_geo_point = result.map_err(InternalError::SerdeJson)?; let extracted_geo_point = result.map_err(InternalError::SerdeJson)?;
/// Fix that
todo!("We must send the docid instead of the geojson"); todo!("We must send the docid instead of the geojson");
/* /*
let removed = cellulite.remove(&GeoJsonPoint::from(extracted_geo_point)); let removed = cellulite.remove(&GeoJsonPoint::from(extracted_geo_point));