From dee31c279f547a61f1aa5066c036aa3135367a72 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 7 Aug 2025 15:55:46 +0200 Subject: [PATCH] First version of Hannoy dumpless upgrade --- Cargo.lock | 34 +++++----- Cargo.toml | 2 +- crates/milli/src/update/upgrade/mod.rs | 7 +- crates/milli/src/update/upgrade/v1_18.rs | 34 ++++++++++ crates/milli/src/vector/mod.rs | 85 +++++++++++++++++++++--- 5 files changed, 134 insertions(+), 28 deletions(-) create mode 100644 crates/milli/src/update/upgrade/v1_18.rs diff --git a/Cargo.lock b/Cargo.lock index 061bd95b3..c735a7694 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -580,7 +580,7 @@ source = "git+https://github.com/meilisearch/bbqueue#cbb87cc707b5af415ef203bdaf2 [[package]] name = "benchmarks" -version = "1.17.1" +version = "1.18.0" dependencies = [ "anyhow", "bumpalo", @@ -770,7 +770,7 @@ dependencies = [ [[package]] name = "build-info" -version = "1.17.1" +version = "1.18.0" dependencies = [ "anyhow", "time", @@ -1774,7 +1774,7 @@ dependencies = [ [[package]] name = "dump" -version = "1.17.1" +version = "1.18.0" dependencies = [ "anyhow", "big_s", @@ -2006,7 +2006,7 @@ checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be" [[package]] name = "file-store" -version = "1.17.1" +version = "1.18.0" dependencies = [ "tempfile", "thiserror 2.0.12", @@ -2028,7 +2028,7 @@ dependencies = [ [[package]] name = "filter-parser" -version = "1.17.1" +version = "1.18.0" dependencies = [ "insta", "levenshtein_automata", @@ -2050,7 +2050,7 @@ dependencies = [ [[package]] name = "flatten-serde-json" -version = "1.17.1" +version = "1.18.0" dependencies = [ "criterion", "serde_json", @@ -2195,7 +2195,7 @@ dependencies = [ [[package]] name = "fuzzers" -version = "1.17.1" +version = "1.18.0" dependencies = [ "arbitrary", "bumpalo", @@ -3017,7 +3017,7 @@ dependencies = [ [[package]] name = "index-scheduler" -version = "1.17.1" +version = "1.18.0" dependencies = [ "anyhow", "backoff", @@ -3253,7 
+3253,7 @@ dependencies = [ [[package]] name = "json-depth-checker" -version = "1.17.1" +version = "1.18.0" dependencies = [ "criterion", "serde_json", @@ -3747,7 +3747,7 @@ checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771" [[package]] name = "meili-snap" -version = "1.17.1" +version = "1.18.0" dependencies = [ "insta", "md5", @@ -3758,7 +3758,7 @@ dependencies = [ [[package]] name = "meilisearch" -version = "1.17.1" +version = "1.18.0" dependencies = [ "actix-cors", "actix-http", @@ -3854,7 +3854,7 @@ dependencies = [ [[package]] name = "meilisearch-auth" -version = "1.17.1" +version = "1.18.0" dependencies = [ "base64 0.22.1", "enum-iterator", @@ -3873,7 +3873,7 @@ dependencies = [ [[package]] name = "meilisearch-types" -version = "1.17.1" +version = "1.18.0" dependencies = [ "actix-web", "anyhow", @@ -3908,7 +3908,7 @@ dependencies = [ [[package]] name = "meilitool" -version = "1.17.1" +version = "1.18.0" dependencies = [ "anyhow", "clap", @@ -3942,7 +3942,7 @@ dependencies = [ [[package]] name = "milli" -version = "1.17.1" +version = "1.18.0" dependencies = [ "allocator-api2 0.3.0", "arroy", @@ -4523,7 +4523,7 @@ checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e" [[package]] name = "permissive-json-pointer" -version = "1.17.1" +version = "1.18.0" dependencies = [ "big_s", "serde_json", @@ -7331,7 +7331,7 @@ dependencies = [ [[package]] name = "xtask" -version = "1.17.1" +version = "1.18.0" dependencies = [ "anyhow", "build-info", diff --git a/Cargo.toml b/Cargo.toml index bc1c354b7..d68fa222c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -23,7 +23,7 @@ members = [ ] [workspace.package] -version = "1.17.1" +version = "1.18.0" authors = [ "Quentin de Quelen ", "Clément Renault ", diff --git a/crates/milli/src/update/upgrade/mod.rs b/crates/milli/src/update/upgrade/mod.rs index ecd1cec6c..22d1ca417 100644 --- a/crates/milli/src/update/upgrade/mod.rs +++ b/crates/milli/src/update/upgrade/mod.rs @@ -3,16 +3,19 @@ 
mod v1_13; mod v1_14; mod v1_15; mod v1_16; +mod v1_18; + use heed::RwTxn; use v1_12::{V1_12_3_To_V1_13_0, V1_12_To_V1_12_3}; use v1_13::{V1_13_0_To_V1_13_1, V1_13_1_To_Latest_V1_13}; use v1_14::Latest_V1_13_To_Latest_V1_14; use v1_15::Latest_V1_14_To_Latest_V1_15; +use v1_16::Latest_V1_15_To_V1_16_0; use v1_16::Latest_V1_16_To_V1_17_0; +use v1_18::Latest_V1_17_To_V1_18_0; use crate::constants::{VERSION_MAJOR, VERSION_MINOR, VERSION_PATCH}; use crate::progress::{Progress, VariableNameStep}; -use crate::update::upgrade::v1_16::Latest_V1_15_To_V1_16_0; use crate::{Index, InternalError, Result}; trait UpgradeIndex { @@ -36,6 +39,7 @@ const UPGRADE_FUNCTIONS: &[&dyn UpgradeIndex] = &[ &Latest_V1_14_To_Latest_V1_15 {}, &Latest_V1_15_To_V1_16_0 {}, &Latest_V1_16_To_V1_17_0 {}, + &Latest_V1_17_To_V1_18_0 {}, // This is the last upgrade function, it will be called when the index is up to date. // any other upgrade function should be added before this one. &ToCurrentNoOp {}, @@ -65,6 +69,7 @@ const fn start(from: (u32, u32, u32)) -> Option { (1, 15, _) => function_index!(6), (1, 16, _) => function_index!(7), (1, 17, _) => function_index!(8), + (1, 18, _) => function_index!(9), // We deliberately don't add a placeholder with (VERSION_MAJOR, VERSION_MINOR, VERSION_PATCH) here to force manually // considering dumpless upgrade. 
(_major, _minor, _patch) => return None,
diff --git a/crates/milli/src/update/upgrade/v1_18.rs b/crates/milli/src/update/upgrade/v1_18.rs
new file mode 100644
index 000000000..e20696b84
--- /dev/null
+++ b/crates/milli/src/update/upgrade/v1_18.rs
@@ -0,0 +1,48 @@
+use heed::RwTxn;
+
+use super::UpgradeIndex;
+use crate::progress::Progress;
+use crate::vector::VectorStore;
+use crate::{Index, Result};
+
+/// Dumpless upgrade step bringing an index to v1.18.0 by converting the
+/// vectors of every configured embedder from arroy to hannoy.
+// Not camel case on purpose: the name follows the `Latest_Vx_To_Vy` pattern
+// of the other upgrade steps.
+#[allow(non_camel_case_types)]
+pub(super) struct Latest_V1_17_To_V1_18_0();
+
+impl UpgradeIndex for Latest_V1_17_To_V1_18_0 {
+    /// Converts the vector store of each embedder found in the index's
+    /// embedding configs, using the write transaction `wtxn`.
+    ///
+    /// Returns `Ok(false)` once every store is converted; errors raised while
+    /// reading the configs or converting a store are propagated.
+    fn upgrade(
+        &self,
+        wtxn: &mut RwTxn,
+        index: &Index,
+        _original: (u32, u32, u32),
+        _progress: Progress,
+    ) -> Result<bool> {
+        let embedding_configs = index.embedding_configs();
+        for config in embedding_configs.embedding_configs(wtxn)? {
+            // TODO use the embedder name to display progress
+            let quantized = config.config.quantized();
+            // The name was just listed by the embedding configs, so it is
+            // expected to resolve to an id; anything else is a broken invariant.
+            let embedder_id = embedding_configs
+                .embedder_id(wtxn, &config.name)?
+                .expect("an embedder listed in the embedding configs must have an id");
+            let vector_store = VectorStore::new(index.vector_store, embedder_id, quantized);
+            vector_store.convert_from_arroy(wtxn)?;
+        }
+
+        Ok(false)
+    }
+
+    /// Version this upgrade step brings the index to.
+    fn target_version(&self) -> (u32, u32, u32) {
+        (1, 18, 0)
+    }
+}
diff --git a/crates/milli/src/vector/mod.rs b/crates/milli/src/vector/mod.rs
index bba617782..0a97a9bde 100644
--- a/crates/milli/src/vector/mod.rs
+++ b/crates/milli/src/vector/mod.rs
@@ -8,6 +8,7 @@ use hannoy::distances::{Cosine, Hamming};
 use hannoy::ItemId;
 use heed::{RoTxn, RwTxn, Unspecified};
 use ordered_float::OrderedFloat;
+use rand::SeedableRng as _;
 use roaring::RoaringBitmap;
 use serde::{Deserialize, Serialize};
 use utoipa::ToSchema;
@@ -64,12 +65,34 @@ impl VectorStore {
         self.embedder_index
     }
 
+    /// Opens the arroy readers of every store belonging to this embedder.
+    ///
+    /// Stores without arroy metadata and stores holding no item are skipped;
+    /// any other error is yielded to the caller.
+    fn arroy_readers<'a, D: arroy::Distance>(
+        &'a self,
+        rtxn: &'a RoTxn<'a>,
+        db: arroy::Database<D>,
+    ) -> impl Iterator<Item = Result<arroy::Reader<'a, D>, arroy::Error>> + 'a {
+        vector_store_range_for_embedder(self.embedder_index).filter_map(move |index| {
+            match arroy::Reader::open(rtxn, index, db) {
+                Ok(reader) => match reader.is_empty(rtxn) {
+                    Ok(false) => Some(Ok(reader)),
+                    Ok(true) => None,
+                    Err(e) => Some(Err(e)),
+                },
+                Err(arroy::Error::MissingMetadata(_)) => None,
+                Err(e) => Some(Err(e)),
+            }
+        })
+    }
+
     fn readers<'a, D: hannoy::Distance>(
         &'a self,
         rtxn: &'a RoTxn<'a>,
         db: hannoy::Database,
     ) -> impl Iterator, hannoy::Error>> + 'a {
-        hannoy_store_range_for_embedder(self.embedder_index).filter_map(move |index| {
+        vector_store_range_for_embedder(self.embedder_index).filter_map(move |index| {
             match hannoy::Reader::open(rtxn, index, db) {
                 Ok(reader) => match reader.is_empty(rtxn) {
                     Ok(false) => Some(Ok(reader)),
@@ -136,6 +159,56 @@ impl VectorStore {
         }
     }
 
+    /// Converts the arroy stores of this embedder into hannoy stores.
+    ///
+    /// The dimensions are taken from the first non-empty arroy reader; if
+    /// there is none, nothing is written and `Ok(())` is returned. Otherwise
+    /// a hannoy writer is prepared and built for every store id of the embedder.
+    pub fn convert_from_arroy(&self, wtxn: &mut RwTxn) -> crate::Result<()> {
+        if self.quantized {
+            let dimensions = self
+                .arroy_readers(wtxn, self.arroy_quantized_db())
+                .next()
+                .transpose()?
+                .map(|reader| reader.dimensions());
+
+            let Some(dimensions) = dimensions else { return Ok(()) };
+
+            // A single generator is lent to each builder in turn instead of
+            // seeding a fresh one for every store.
+            let mut rng = rand::rngs::StdRng::from_entropy();
+            for index in vector_store_range_for_embedder(self.embedder_index) {
+                let writer = hannoy::Writer::new(self.quantized_db(), index, dimensions);
+                let mut builder = writer.builder(&mut rng);
+                builder.prepare_arroy_conversion(wtxn)?;
+                // NOTE(review): the arguments of the turbofish below are missing
+                // from this copy of the patch; restore them before applying it.
+                builder.build::(wtxn)?;
+            }
+
+            Ok(())
+        } else {
+            let dimensions = self
+                .arroy_readers(wtxn, self.arroy_angular_db())
+                .next()
+                .transpose()?
+                .map(|reader| reader.dimensions());
+
+            let Some(dimensions) = dimensions else { return Ok(()) };
+
+            let mut rng = rand::rngs::StdRng::from_entropy();
+            for index in vector_store_range_for_embedder(self.embedder_index) {
+                let writer = hannoy::Writer::new(self.angular_db(), index, dimensions);
+                let mut builder = writer.builder(&mut rng);
+                builder.prepare_arroy_conversion(wtxn)?;
+                // NOTE(review): same missing turbofish arguments as above.
+                builder.build::(wtxn)?;
+            }
+
+            Ok(())
+        }
+    }
+
     #[allow(clippy::too_many_arguments)]
     pub fn build_and_quantize(
         &mut self,
@@ -147,7 +220,7 @@ impl VectorStore {
         hannoy_memory: Option,
         cancel: &(impl Fn() -> bool + Sync + Send),
     ) -> Result<(), hannoy::Error> {
-        for index in hannoy_store_range_for_embedder(self.embedder_index) {
+        for index in vector_store_range_for_embedder(self.embedder_index) {
             if self.quantized {
                 let writer = hannoy::Writer::new(self.quantized_db(), index, dimension);
                 if writer.need_build(wtxn)? {
@@ -202,7 +275,7 @@ impl VectorStore {
     ) -> Result<(), hannoy::Error> {
         let dimension = embeddings.dimension();
         for (index, vector) in
-            hannoy_store_range_for_embedder(self.embedder_index).zip(embeddings.iter())
+            vector_store_range_for_embedder(self.embedder_index).zip(embeddings.iter())
         {
             if self.quantized {
                 hannoy::Writer::new(self.quantized_db(), index, dimension)
@@ -238,7 +311,7 @@ impl VectorStore {
     ) -> Result<(), hannoy::Error> {
         let dimension = vector.len();
 
-        for index in hannoy_store_range_for_embedder(self.embedder_index) {
+        for index in vector_store_range_for_embedder(self.embedder_index) {
             let writer = hannoy::Writer::new(db, index, dimension);
             if !writer.contains_item(wtxn, item_id)?
{ writer.add_item(wtxn, item_id, vector)?; @@ -287,7 +346,7 @@ impl VectorStore { dimension: usize, item_id: hannoy::ItemId, ) -> Result<(), hannoy::Error> { - for index in hannoy_store_range_for_embedder(self.embedder_index) { + for index in vector_store_range_for_embedder(self.embedder_index) { if self.quantized { let writer = hannoy::Writer::new(self.quantized_db(), index, dimension); writer.del_item(wtxn, item_id)?; @@ -387,7 +446,7 @@ impl VectorStore { ) -> Result { let dimension = vector.len(); - for index in hannoy_store_range_for_embedder(self.embedder_index) { + for index in vector_store_range_for_embedder(self.embedder_index) { let writer = hannoy::Writer::new(db, index, dimension); if writer.contains_item(wtxn, item_id)? { return writer.del_item(wtxn, item_id); @@ -397,7 +456,7 @@ impl VectorStore { } pub fn clear(&self, wtxn: &mut RwTxn, dimension: usize) -> Result<(), hannoy::Error> { - for index in hannoy_store_range_for_embedder(self.embedder_index) { + for index in vector_store_range_for_embedder(self.embedder_index) { if self.quantized { let writer = hannoy::Writer::new(self.quantized_db(), index, dimension); if writer.is_empty(wtxn)? { @@ -421,7 +480,7 @@ impl VectorStore { dimension: usize, item: hannoy::ItemId, ) -> Result { - for index in hannoy_store_range_for_embedder(self.embedder_index) { + for index in vector_store_range_for_embedder(self.embedder_index) { let contains = if self.quantized { let writer = hannoy::Writer::new(self.quantized_db(), index, dimension); if writer.is_empty(rtxn)? 
{
@@ -547,6 +606,21 @@ impl VectorStore {
         Ok(vectors)
     }
 
+    /// Reinterprets `self.database` as the arroy database that
+    /// `convert_from_arroy` reads when the store is not quantized.
+    // NOTE(review): the type parameter of `arroy::Database` is missing from
+    // this copy of the patch; confirm it before applying.
+    fn arroy_angular_db(&self) -> arroy::Database {
+        self.database.remap_types()
+    }
+
+    /// Reinterprets `self.database` as the arroy database that
+    /// `convert_from_arroy` reads when the store is quantized.
+    // NOTE(review): same missing type parameter as `arroy_angular_db`.
+    fn arroy_quantized_db(&self) -> arroy::Database {
+        self.database.remap_types()
+    }
+
     fn angular_db(&self) -> hannoy::Database {
         self.database.remap_data_type()
     }
@@ -1230,7 +1304,9 @@ pub const fn is_cuda_enabled() -> bool {
     cfg!(feature = "cuda")
 }
 
-fn hannoy_store_range_for_embedder(embedder_id: u8) -> impl Iterator {
+/// Maps every `store_id` in `0..=u8::MAX` through
+/// `hannoy_store_for_embedder(embedder_id, store_id)` and yields the results.
+fn vector_store_range_for_embedder(embedder_id: u8) -> impl Iterator {
     (0..=u8::MAX).map(move |store_id| hannoy_store_for_embedder(embedder_id, store_id))
 }