From 8a01b39f11c35d4dc7e9a01306232c7ca9c9ee65 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 7 Aug 2025 15:55:46 +0200 Subject: [PATCH] First version of Hannoy dumpless upgrade --- Cargo.lock | 36 +++++----- Cargo.toml | 2 +- crates/index-scheduler/src/upgrade/mod.rs | 1 + crates/milli/src/update/upgrade/mod.rs | 7 +- crates/milli/src/update/upgrade/v1_18.rs | 34 ++++++++++ crates/milli/src/vector/mod.rs | 83 ++++++++++++++++++++--- 6 files changed, 134 insertions(+), 29 deletions(-) create mode 100644 crates/milli/src/update/upgrade/v1_18.rs diff --git a/Cargo.lock b/Cargo.lock index c14396ddd..a79fc4775 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -580,7 +580,7 @@ source = "git+https://github.com/meilisearch/bbqueue#cbb87cc707b5af415ef203bdaf2 [[package]] name = "benchmarks" -version = "1.16.0" +version = "1.18.0" dependencies = [ "anyhow", "bumpalo", @@ -770,7 +770,7 @@ dependencies = [ [[package]] name = "build-info" -version = "1.16.0" +version = "1.18.0" dependencies = [ "anyhow", "time", @@ -1774,7 +1774,7 @@ dependencies = [ [[package]] name = "dump" -version = "1.16.0" +version = "1.18.0" dependencies = [ "anyhow", "big_s", @@ -2006,7 +2006,7 @@ checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be" [[package]] name = "file-store" -version = "1.16.0" +version = "1.18.0" dependencies = [ "tempfile", "thiserror 2.0.12", @@ -2028,7 +2028,7 @@ dependencies = [ [[package]] name = "filter-parser" -version = "1.16.0" +version = "1.18.0" dependencies = [ "insta", "nom", @@ -2049,7 +2049,7 @@ dependencies = [ [[package]] name = "flatten-serde-json" -version = "1.16.0" +version = "1.18.0" dependencies = [ "criterion", "serde_json", @@ -2194,7 +2194,7 @@ dependencies = [ [[package]] name = "fuzzers" -version = "1.16.0" +version = "1.18.0" dependencies = [ "arbitrary", "bumpalo", @@ -2603,7 +2603,7 @@ dependencies = [ [[package]] name = "hannoy" version = "0.0.2" -source = "git+https://github.com/nnethercott/hannoy?branch=main#93a24c4cdf712152c90d27a2898715f22942c35c" +source = "git+https://github.com/nnethercott/hannoy?branch=main#d7097b5214c211f5d2bb9d2643f3d9fb8ccb03e2" dependencies = [ "bytemuck", "byteorder", @@ -3015,7 +3015,7 @@ dependencies = [ [[package]] name = "index-scheduler" -version = "1.16.0" +version = "1.18.0" dependencies = [ "anyhow", "backoff", @@ -3251,7 +3251,7 @@ dependencies = [ [[package]] name = "json-depth-checker" -version = "1.16.0" +version = "1.18.0" dependencies = [ "criterion", "serde_json", @@ -3745,7 +3745,7 @@ checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771" [[package]] name = "meili-snap" -version = "1.16.0" +version = "1.18.0" dependencies = [ "insta", "md5", @@ -3756,7 +3756,7 @@ dependencies = [ [[package]] name = "meilisearch" -version = "1.16.0" +version = "1.18.0" dependencies = [ "actix-cors", "actix-http", @@ -3852,7 +3852,7 @@ dependencies = [ [[package]] name = "meilisearch-auth" -version = "1.16.0" +version = "1.18.0" dependencies = [ "base64 0.22.1", "enum-iterator", @@ -3871,7 +3871,7 @@ dependencies = [ [[package]] name = "meilisearch-types" -version = "1.16.0" +version = "1.18.0" dependencies = [ "actix-web", "anyhow", @@ -3906,7 +3906,7 @@ dependencies = [ [[package]] name = "meilitool" -version = "1.16.0" +version = "1.18.0" dependencies = [ "anyhow", "clap", @@ -3940,7 +3940,7 @@ dependencies = [ [[package]] name = "milli" -version = "1.16.0" +version = "1.18.0" dependencies = [ "allocator-api2 0.3.0", "arroy", @@ -4509,7 +4509,7 @@ checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e" [[package]] name = "permissive-json-pointer" -version = "1.16.0" +version = "1.18.0" dependencies = [ "big_s", "serde_json", @@ -7307,7 +7307,7 @@ dependencies = [ [[package]] name = "xtask" -version = "1.16.0" +version = "1.18.0" dependencies = [ "anyhow", "build-info", diff --git a/Cargo.toml b/Cargo.toml index 3e57563b6..f8dbc1ccc 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -22,7 +22,7 @@ members = [ ] [workspace.package] -version = "1.16.0" +version = "1.18.0" authors = [ "Quentin de Quelen ", "Clément Renault ", diff --git a/crates/index-scheduler/src/upgrade/mod.rs b/crates/index-scheduler/src/upgrade/mod.rs index 2053caa92..a749b31d5 100644 --- a/crates/index-scheduler/src/upgrade/mod.rs +++ b/crates/index-scheduler/src/upgrade/mod.rs @@ -39,6 +39,7 @@ pub fn upgrade_index_scheduler( (1, 13, _) => 0, (1, 14, _) => 0, (1, 15, _) => 0, + (1, 16, _) => 0, (major, minor, patch) => { if major > current_major || (major == current_major && minor > current_minor) diff --git a/crates/milli/src/update/upgrade/mod.rs b/crates/milli/src/update/upgrade/mod.rs index f53319a37..f1e8196f0 100644 --- a/crates/milli/src/update/upgrade/mod.rs +++ b/crates/milli/src/update/upgrade/mod.rs @@ -3,15 +3,18 @@ mod v1_13; mod v1_14; mod v1_15; mod v1_16; +mod v1_18; + use heed::RwTxn; use v1_12::{V1_12_3_To_V1_13_0, V1_12_To_V1_12_3}; use v1_13::{V1_13_0_To_V1_13_1, V1_13_1_To_Latest_V1_13}; use v1_14::Latest_V1_13_To_Latest_V1_14; use v1_15::Latest_V1_14_To_Latest_V1_15; +use v1_16::Latest_V1_15_To_V1_16_0; +use v1_18::Latest_V1_17_To_V1_18_0; use crate::constants::{VERSION_MAJOR, VERSION_MINOR, VERSION_PATCH}; use crate::progress::{Progress, VariableNameStep}; -use crate::update::upgrade::v1_16::Latest_V1_15_To_V1_16_0; use crate::{Index, InternalError, Result}; trait UpgradeIndex { @@ -34,6 +37,7 @@ const UPGRADE_FUNCTIONS: &[&dyn UpgradeIndex] = &[ &Latest_V1_13_To_Latest_V1_14 {}, &Latest_V1_14_To_Latest_V1_15 {}, &Latest_V1_15_To_V1_16_0 {}, + &Latest_V1_17_To_V1_18_0 {}, // This is the last upgrade function, it will be called when the index is up to date. // any other upgrade function should be added before this one. &ToCurrentNoOp {}, @@ -62,6 +66,7 @@ const fn start(from: (u32, u32, u32)) -> Option { // We must handle the current version in the match because in case of a failure some index may have been upgraded but not other. (1, 15, _) => function_index!(6), (1, 16, _) => function_index!(7), + (1, 18, _) => function_index!(8), // We deliberately don't add a placeholder with (VERSION_MAJOR, VERSION_MINOR, VERSION_PATCH) here to force manually // considering dumpless upgrade. (_major, _minor, _patch) => return None, diff --git a/crates/milli/src/update/upgrade/v1_18.rs b/crates/milli/src/update/upgrade/v1_18.rs new file mode 100644 index 000000000..e20696b84 --- /dev/null +++ b/crates/milli/src/update/upgrade/v1_18.rs @@ -0,0 +1,34 @@ +use heed::RwTxn; + +use super::UpgradeIndex; +use crate::progress::Progress; +use crate::vector::VectorStore; +use crate::{Index, Result}; + +#[allow(non_camel_case_types)] +pub(super) struct Latest_V1_17_To_V1_18_0(); + +impl UpgradeIndex for Latest_V1_17_To_V1_18_0 { + fn upgrade( + &self, + wtxn: &mut RwTxn, + index: &Index, + _original: (u32, u32, u32), + _progress: Progress, + ) -> Result { + let embedding_configs = index.embedding_configs(); + for config in embedding_configs.embedding_configs(wtxn)? { + // TODO use the embedder name to display progress + let quantized = config.config.quantized(); + let embedder_id = embedding_configs.embedder_id(wtxn, &config.name)?.unwrap(); + let vector_store = VectorStore::new(index.vector_store, embedder_id, quantized); + vector_store.convert_from_arroy(wtxn)?; + } + + Ok(false) + } + + fn target_version(&self) -> (u32, u32, u32) { + (1, 18, 0) + } +} diff --git a/crates/milli/src/vector/mod.rs b/crates/milli/src/vector/mod.rs index ba4402eed..b4d6e678a 100644 --- a/crates/milli/src/vector/mod.rs +++ b/crates/milli/src/vector/mod.rs @@ -8,6 +8,7 @@ use hannoy::distances::{BinaryQuantizedCosine, Cosine}; use hannoy::ItemId; use heed::{RoTxn, RwTxn, Unspecified}; use ordered_float::OrderedFloat; +use rand::SeedableRng; use roaring::RoaringBitmap; use serde::{Deserialize, Serialize}; use utoipa::ToSchema; @@ -69,7 +70,7 @@ impl VectorStore { rtxn: &'a RoTxn<'a>, db: hannoy::Database, ) -> impl Iterator, hannoy::Error>> + 'a { - hannoy_store_range_for_embedder(self.embedder_index).filter_map(move |index| { + vector_store_range_for_embedder(self.embedder_index).filter_map(move |index| { match hannoy::Reader::open(rtxn, index, db) { Ok(reader) => match reader.is_empty(rtxn) { Ok(false) => Some(Ok(reader)), @@ -82,6 +83,24 @@ impl VectorStore { }) } + fn arroy_readers<'a, D: arroy::Distance>( + &'a self, + rtxn: &'a RoTxn<'a>, + db: arroy::Database, + ) -> impl Iterator, arroy::Error>> + 'a { + vector_store_range_for_embedder(self.embedder_index).filter_map(move |index| { + match arroy::Reader::open(rtxn, index, db) { + Ok(reader) => match reader.is_empty(rtxn) { + Ok(false) => Some(Ok(reader)), + Ok(true) => None, + Err(e) => Some(Err(e)), + }, + Err(arroy::Error::MissingMetadata(_)) => None, + Err(e) => Some(Err(e)), + } + }) + } + /// The item ids that are present in the store specified by its id. /// /// The ids are accessed via a lambda to avoid lifetime shenanigans. @@ -136,6 +155,44 @@ impl VectorStore { } } + pub fn convert_from_arroy(&self, wtxn: &mut RwTxn) -> crate::Result<()> { + if self.quantized { + let dimensions = self + .arroy_readers(wtxn, self.arroy_quantized_db()) + .next() + .transpose()? + .map(|reader| reader.dimensions()); + + let Some(dimensions) = dimensions else { return Ok(()) }; + + for index in vector_store_range_for_embedder(self.embedder_index) { + let mut rng = rand::rngs::StdRng::from_entropy(); + let writer = hannoy::Writer::new(self.quantized_db(), index, dimensions); + writer.prepare_arroy_conversion(wtxn)?; + writer.builder(&mut rng).build::(wtxn)?; + } + + Ok(()) + } else { + let dimensions = self + .arroy_readers(wtxn, self.arroy_angular_db()) + .next() + .transpose()? + .map(|reader| reader.dimensions()); + + let Some(dimensions) = dimensions else { return Ok(()) }; + + for index in vector_store_range_for_embedder(self.embedder_index) { + let mut rng = rand::rngs::StdRng::from_entropy(); + let writer = hannoy::Writer::new(self.angular_db(), index, dimensions); + writer.prepare_arroy_conversion(wtxn)?; + writer.builder(&mut rng).build::(wtxn)?; + } + + Ok(()) + } + } + #[allow(clippy::too_many_arguments)] pub fn build_and_quantize( &mut self, @@ -147,7 +204,7 @@ impl VectorStore { hannoy_memory: Option, cancel: &(impl Fn() -> bool + Sync + Send), ) -> Result<(), hannoy::Error> { - for index in hannoy_store_range_for_embedder(self.embedder_index) { + for index in vector_store_range_for_embedder(self.embedder_index) { if self.quantized { let writer = hannoy::Writer::new(self.quantized_db(), index, dimension); if writer.need_build(wtxn)? { @@ -204,7 +261,7 @@ impl VectorStore { ) -> Result<(), hannoy::Error> { let dimension = embeddings.dimension(); for (index, vector) in - hannoy_store_range_for_embedder(self.embedder_index).zip(embeddings.iter()) + vector_store_range_for_embedder(self.embedder_index).zip(embeddings.iter()) { if self.quantized { hannoy::Writer::new(self.quantized_db(), index, dimension) @@ -240,7 +297,7 @@ impl VectorStore { ) -> Result<(), hannoy::Error> { let dimension = vector.len(); - for index in hannoy_store_range_for_embedder(self.embedder_index) { + for index in vector_store_range_for_embedder(self.embedder_index) { let writer = hannoy::Writer::new(db, index, dimension); if !writer.contains_item(wtxn, item_id)? { writer.add_item(wtxn, item_id, vector)?; @@ -289,7 +346,7 @@ impl VectorStore { dimension: usize, item_id: hannoy::ItemId, ) -> Result<(), hannoy::Error> { - for index in hannoy_store_range_for_embedder(self.embedder_index) { + for index in vector_store_range_for_embedder(self.embedder_index) { if self.quantized { let writer = hannoy::Writer::new(self.quantized_db(), index, dimension); writer.del_item(wtxn, item_id)?; @@ -389,7 +446,7 @@ impl VectorStore { ) -> Result { let dimension = vector.len(); - for index in hannoy_store_range_for_embedder(self.embedder_index) { + for index in vector_store_range_for_embedder(self.embedder_index) { let writer = hannoy::Writer::new(db, index, dimension); if writer.contains_item(wtxn, item_id)? { return writer.del_item(wtxn, item_id); @@ -399,7 +456,7 @@ impl VectorStore { } pub fn clear(&self, wtxn: &mut RwTxn, dimension: usize) -> Result<(), hannoy::Error> { - for index in hannoy_store_range_for_embedder(self.embedder_index) { + for index in vector_store_range_for_embedder(self.embedder_index) { if self.quantized { let writer = hannoy::Writer::new(self.quantized_db(), index, dimension); if writer.is_empty(wtxn)? { @@ -423,7 +480,7 @@ impl VectorStore { dimension: usize, item: hannoy::ItemId, ) -> Result { - for index in hannoy_store_range_for_embedder(self.embedder_index) { + for index in vector_store_range_for_embedder(self.embedder_index) { let contains = if self.quantized { let writer = hannoy::Writer::new(self.quantized_db(), index, dimension); if writer.is_empty(rtxn)? { @@ -557,6 +614,14 @@ impl VectorStore { self.database.remap_data_type() } + fn arroy_angular_db(&self) -> arroy::Database { + self.database.remap_types() + } + + fn arroy_quantized_db(&self) -> arroy::Database { + self.database.remap_types() + } + pub fn aggregate_stats( &self, rtxn: &RoTxn, @@ -1238,7 +1303,7 @@ pub const fn is_cuda_enabled() -> bool { cfg!(feature = "cuda") } -fn hannoy_store_range_for_embedder(embedder_id: u8) -> impl Iterator { +fn vector_store_range_for_embedder(embedder_id: u8) -> impl Iterator { (0..=u8::MAX).map(move |store_id| hannoy_store_for_embedder(embedder_id, store_id)) }