diff --git a/Cargo.lock b/Cargo.lock index 1d027efab..a689649ab 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -442,6 +442,28 @@ version = "0.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" +[[package]] +name = "arroy" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08e6111f351d004bd13e95ab540721272136fd3218b39d3ec95a2ea1c4e6a0a6" +dependencies = [ + "bytemuck", + "byteorder", + "enum-iterator", + "heed", + "memmap2", + "nohash", + "ordered-float 4.6.0", + "page_size", + "rand 0.8.5", + "rayon", + "roaring", + "tempfile", + "thiserror 2.0.12", + "tracing", +] + [[package]] name = "assert-json-diff" version = "2.0.2" @@ -3927,6 +3949,7 @@ name = "milli" version = "1.18.0" dependencies = [ "allocator-api2 0.3.0", + "arroy", "bbqueue", "big_s", "bimap", @@ -4382,6 +4405,15 @@ dependencies = [ "num-traits", ] +[[package]] +name = "ordered-float" +version = "4.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7bb71e1b3fa6ca1c61f383464aaf2bb0e2f8e772a1f01d486832464de363b951" +dependencies = [ + "num-traits", +] + [[package]] name = "ordered-float" version = "5.0.0" diff --git a/crates/meilitool/src/main.rs b/crates/meilitool/src/main.rs index 439bba015..1a2110b5d 100644 --- a/crates/meilitool/src/main.rs +++ b/crates/meilitool/src/main.rs @@ -660,10 +660,10 @@ fn hair_dryer( match part { IndexPart::Hannoy => { let mut count = 0; - let total = index.vector_hannoy.len(&rtxn)?; + let total = index.vector_store.len(&rtxn)?; eprintln!("Hair drying hannoy for {uid}..."); for (i, result) in index - .vector_hannoy + .vector_store .remap_types::() .iter(&rtxn)? 
.enumerate() diff --git a/crates/meilitool/src/upgrade/v1_11.rs b/crates/meilitool/src/upgrade/v1_11.rs index e8327a26d..f1d5c1959 100644 --- a/crates/meilitool/src/upgrade/v1_11.rs +++ b/crates/meilitool/src/upgrade/v1_11.rs @@ -82,13 +82,12 @@ pub fn v1_10_to_v1_11( try_opening_poly_database(&index_env, &index_wtxn, db_name::VECTOR_STORE) .with_context(|| format!("while updating date format for index `{uid}`"))?; - // meilisearch_types::milli::hannoy::upgrade::cosine_from_0_4_to_0_5( - // &index_rtxn, - // index_read_database.remap_types(), - // &mut index_wtxn, - // index_write_database.remap_types(), - // )?; - unimplemented!("Hannoy doesn't support upgrading"); + meilisearch_types::milli::arroy::upgrade::cosine_from_0_4_to_0_5( + &index_rtxn, + index_read_database.remap_types(), + &mut index_wtxn, + index_write_database.remap_types(), + )?; index_wtxn.commit()?; } diff --git a/crates/milli/Cargo.toml b/crates/milli/Cargo.toml index 812ac4376..b24a1779f 100644 --- a/crates/milli/Cargo.toml +++ b/crates/milli/Cargo.toml @@ -87,6 +87,7 @@ rhai = { version = "1.22.2", features = [ "no_time", "sync", ] } +arroy = "0.6.1" hannoy = { git = "https://github.com/nnethercott/hannoy", tag = "v0.0.1" } rand = "0.8.5" tracing = "0.1.41" diff --git a/crates/milli/src/error.rs b/crates/milli/src/error.rs index 2769284aa..787f42753 100644 --- a/crates/milli/src/error.rs +++ b/crates/milli/src/error.rs @@ -76,6 +76,8 @@ pub enum InternalError { #[error("Cannot upgrade to the following version: v{0}.{1}.{2}.")] CannotUpgradeToVersion(u32, u32, u32), #[error(transparent)] + ArroyError(#[from] arroy::Error), + #[error(transparent)] HannoyError(#[from] hannoy::Error), #[error(transparent)] VectorEmbeddingError(#[from] crate::vector::Error), @@ -419,6 +421,28 @@ impl From for Error { } } +impl From for Error { + fn from(value: arroy::Error) -> Self { + match value { + arroy::Error::Heed(heed) => heed.into(), + arroy::Error::Io(io) => io.into(), + arroy::Error::InvalidVecDimension { 
expected, received } => { + Error::UserError(UserError::InvalidVectorDimensions { expected, found: received }) + } + arroy::Error::BuildCancelled => Error::InternalError(InternalError::AbortedIndexation), + arroy::Error::DatabaseFull + | arroy::Error::InvalidItemAppend + | arroy::Error::UnmatchingDistance { .. } + | arroy::Error::NeedBuild(_) + | arroy::Error::MissingKey { .. } + | arroy::Error::MissingMetadata(_) + | arroy::Error::CannotDecodeKeyMode { .. } => { + Error::InternalError(InternalError::ArroyError(value)) + } + } + } +} + impl From for Error { fn from(value: hannoy::Error) -> Self { match value { diff --git a/crates/milli/src/index.rs b/crates/milli/src/index.rs index d76ecfc9d..dc8298a1b 100644 --- a/crates/milli/src/index.rs +++ b/crates/milli/src/index.rs @@ -180,7 +180,7 @@ pub struct Index { /// Maps an embedder name to its id in the hannoy store. pub(crate) embedder_category_id: Database, /// Vector store based on hannoy™. - pub vector_hannoy: hannoy::Database, + pub vector_store: hannoy::Database, /// Maps the document id to the document as an obkv store. pub(crate) documents: Database, @@ -264,7 +264,7 @@ impl Index { facet_id_is_empty_docids, field_id_docid_facet_f64s, field_id_docid_facet_strings, - vector_hannoy, + vector_store: vector_hannoy, embedder_category_id, documents, }; @@ -1773,7 +1773,7 @@ impl Index { let embedder_info = embedders.embedder_info(rtxn, &config.name)?.unwrap(); let has_fragments = config.config.embedder_options.has_fragments(); let reader = VectorStore::new( - self.vector_hannoy, + self.vector_store, embedder_info.embedder_id, config.config.quantized(), ); @@ -1798,7 +1798,7 @@ impl Index { for config in embedding_configs.embedding_configs(rtxn)? 
{ let embedder_id = embedding_configs.embedder_id(rtxn, &config.name)?.unwrap(); let reader = - VectorStore::new(self.vector_hannoy, embedder_id, config.config.quantized()); + VectorStore::new(self.vector_store, embedder_id, config.config.quantized()); reader.aggregate_stats(rtxn, &mut stats)?; } Ok(stats) @@ -1842,7 +1842,7 @@ impl Index { facet_id_is_empty_docids, field_id_docid_facet_f64s, field_id_docid_facet_strings, - vector_hannoy, + vector_store: vector_hannoy, embedder_category_id, documents, } = self; diff --git a/crates/milli/src/lib.rs b/crates/milli/src/lib.rs index 91ea87b68..ca867d6e0 100644 --- a/crates/milli/src/lib.rs +++ b/crates/milli/src/lib.rs @@ -53,7 +53,7 @@ pub use search::new::{ }; use serde_json::Value; pub use thread_pool_no_abort::{PanicCatched, ThreadPoolNoAbort, ThreadPoolNoAbortBuilder}; -pub use {charabia as tokenizer, hannoy, heed, rhai}; +pub use {arroy, charabia as tokenizer, hannoy, heed, rhai}; pub use self::asc_desc::{AscDesc, AscDescError, Member, SortError}; pub use self::attribute_patterns::{AttributePatterns, PatternMatch}; diff --git a/crates/milli/src/search/new/vector_sort.rs b/crates/milli/src/search/new/vector_sort.rs index ce755c57d..284fcd431 100644 --- a/crates/milli/src/search/new/vector_sort.rs +++ b/crates/milli/src/search/new/vector_sort.rs @@ -56,7 +56,7 @@ impl VectorSort { let target = &self.target; let before = Instant::now(); - let reader = VectorStore::new(ctx.index.vector_hannoy, self.embedder_index, self.quantized); + let reader = VectorStore::new(ctx.index.vector_store, self.embedder_index, self.quantized); let results = reader.nns_by_vector(ctx.txn, target, self.limit, Some(vector_candidates))?; self.cached_sorted_docids = results.into_iter(); *ctx.vector_store_stats.get_or_insert_default() += VectorStoreStats { diff --git a/crates/milli/src/search/similar.rs b/crates/milli/src/search/similar.rs index ec3a5a565..83e65fd6a 100644 --- a/crates/milli/src/search/similar.rs +++ 
b/crates/milli/src/search/similar.rs @@ -72,7 +72,7 @@ impl<'a> Similar<'a> { crate::UserError::InvalidSimilarEmbedder(self.embedder_name.to_owned()) })?; - let reader = VectorStore::new(self.index.vector_hannoy, embedder_index, self.quantized); + let reader = VectorStore::new(self.index.vector_store, embedder_index, self.quantized); let results = reader.nns_by_item( self.rtxn, self.id, diff --git a/crates/milli/src/update/clear_documents.rs b/crates/milli/src/update/clear_documents.rs index 164aed9a0..6cd389d42 100644 --- a/crates/milli/src/update/clear_documents.rs +++ b/crates/milli/src/update/clear_documents.rs @@ -2,7 +2,8 @@ use heed::RwTxn; use roaring::RoaringBitmap; use time::OffsetDateTime; -use crate::{database_stats::DatabaseStats, FieldDistribution, Index, Result}; +use crate::database_stats::DatabaseStats; +use crate::{FieldDistribution, Index, Result}; pub struct ClearDocuments<'t, 'i> { wtxn: &'t mut RwTxn<'i>, @@ -45,7 +46,7 @@ impl<'t, 'i> ClearDocuments<'t, 'i> { facet_id_is_empty_docids, field_id_docid_facet_f64s, field_id_docid_facet_strings, - vector_hannoy, + vector_store, embedder_category_id: _, documents, } = self.index; @@ -88,7 +89,7 @@ impl<'t, 'i> ClearDocuments<'t, 'i> { field_id_docid_facet_f64s.clear(self.wtxn)?; field_id_docid_facet_strings.clear(self.wtxn)?; // vector - vector_hannoy.clear(self.wtxn)?; + vector_store.clear(self.wtxn)?; documents.clear(self.wtxn)?; diff --git a/crates/milli/src/update/index_documents/mod.rs b/crates/milli/src/update/index_documents/mod.rs index 5bfc8c218..f4d1552bd 100644 --- a/crates/milli/src/update/index_documents/mod.rs +++ b/crates/milli/src/update/index_documents/mod.rs @@ -493,8 +493,7 @@ where key: None, }, )?; - let reader = - VectorStore::new(self.index.vector_hannoy, index, action.was_quantized); + let reader = VectorStore::new(self.index.vector_store, index, action.was_quantized); let Some(dim) = reader.dimensions(self.wtxn)? 
else { continue; }; @@ -504,7 +503,7 @@ where for (embedder_name, dimension) in dimension { let wtxn = &mut *self.wtxn; - let vector_hannoy = self.index.vector_hannoy; + let vector_hannoy = self.index.vector_store; let cancel = &self.should_abort; let embedder_index = diff --git a/crates/milli/src/update/index_documents/transform.rs b/crates/milli/src/update/index_documents/transform.rs index b7c936a82..985f3a88f 100644 --- a/crates/milli/src/update/index_documents/transform.rs +++ b/crates/milli/src/update/index_documents/transform.rs @@ -842,7 +842,7 @@ impl<'a, 'i> Transform<'a, 'i> { action.write_back() { let reader = VectorStore::new( - self.index.vector_hannoy, + self.index.vector_store, *embedder_id, action.was_quantized, ); @@ -950,7 +950,7 @@ impl<'a, 'i> Transform<'a, 'i> { continue; }; let hannoy = - VectorStore::new(self.index.vector_hannoy, infos.embedder_id, was_quantized); + VectorStore::new(self.index.vector_store, infos.embedder_id, was_quantized); let Some(dimensions) = hannoy.dimensions(wtxn)? 
else { continue; }; diff --git a/crates/milli/src/update/index_documents/typed_chunk.rs b/crates/milli/src/update/index_documents/typed_chunk.rs index 31616906c..fe5a8bde8 100644 --- a/crates/milli/src/update/index_documents/typed_chunk.rs +++ b/crates/milli/src/update/index_documents/typed_chunk.rs @@ -677,7 +677,7 @@ pub(crate) fn write_typed_chunk_into_index( .get(&embedder_name) .is_some_and(|conf| conf.is_quantized); // FIXME: allow customizing distance - let writer = VectorStore::new(index.vector_hannoy, infos.embedder_id, binary_quantized); + let writer = VectorStore::new(index.vector_store, infos.embedder_id, binary_quantized); // remove vectors for docids we want them removed let merger = remove_vectors_builder.build(); diff --git a/crates/milli/src/update/new/indexer/mod.rs b/crates/milli/src/update/new/indexer/mod.rs index e750d39a8..2d54c62b0 100644 --- a/crates/milli/src/update/new/indexer/mod.rs +++ b/crates/milli/src/update/new/indexer/mod.rs @@ -130,7 +130,7 @@ where let global_fields_ids_map = GlobalFieldsIdsMap::new(&new_fields_ids_map); - let vector_arroy = index.vector_hannoy; + let vector_arroy = index.vector_store; let hannoy_writers: Result> = embedders .inner_as_ref() .iter() @@ -343,7 +343,7 @@ fn hannoy_writers_from_embedder_actions<'indexer>( embedders: &'indexer RuntimeEmbedders, index_embedder_category_ids: &'indexer std::collections::HashMap, ) -> Result> { - let vector_arroy = index.vector_hannoy; + let vector_arroy = index.vector_store; embedders .inner_as_ref() @@ -385,7 +385,7 @@ where let Some(WriteBackToDocuments { embedder_id, .. }) = action.write_back() else { continue; }; - let reader = VectorStore::new(index.vector_hannoy, *embedder_id, action.was_quantized); + let reader = VectorStore::new(index.vector_store, *embedder_id, action.was_quantized); let Some(dimensions) = reader.dimensions(wtxn)? else { continue; }; @@ -401,7 +401,7 @@ where let Some(infos) = index.embedding_configs().embedder_info(wtxn, embedder_name)? 
else { continue; }; - let arroy = VectorStore::new(index.vector_hannoy, infos.embedder_id, was_quantized); + let arroy = VectorStore::new(index.vector_store, infos.embedder_id, was_quantized); let Some(dimensions) = arroy.dimensions(wtxn)? else { continue; }; diff --git a/crates/milli/src/update/new/vector_document.rs b/crates/milli/src/update/new/vector_document.rs index af171f143..d04f9bb79 100644 --- a/crates/milli/src/update/new/vector_document.rs +++ b/crates/milli/src/update/new/vector_document.rs @@ -121,7 +121,7 @@ impl<'t> VectorDocumentFromDb<'t> { status: &EmbeddingStatus, ) -> Result> { let reader = - VectorStore::new(self.index.vector_hannoy, embedder_id, config.config.quantized()); + VectorStore::new(self.index.vector_store, embedder_id, config.config.quantized()); let vectors = reader.item_vectors(self.rtxn, self.docid)?; Ok(VectorEntry { diff --git a/crates/milli/src/update/upgrade/v1_14.rs b/crates/milli/src/update/upgrade/v1_14.rs index 68ff9f8cd..9950be706 100644 --- a/crates/milli/src/update/upgrade/v1_14.rs +++ b/crates/milli/src/update/upgrade/v1_14.rs @@ -1,4 +1,4 @@ -use hannoy::distances::Cosine; +use arroy::distances::Cosine; use heed::RwTxn; use super::UpgradeIndex; @@ -25,13 +25,12 @@ impl UpgradeIndex for Latest_V1_13_To_Latest_V1_14 { progress.update_progress(VectorStore::UpdateInternalVersions); let rtxn = index.read_txn()?; - // hannoy::upgrade::from_0_5_to_0_6::( - // &rtxn, - // index.vector_hannoy.remap_data_type(), - // wtxn, - // index.vector_hannoy.remap_data_type(), - // )?; - unimplemented!("upgrade hannoy"); + arroy::upgrade::from_0_5_to_0_6::( + &rtxn, + index.vector_store.remap_types(), + wtxn, + index.vector_store.remap_types(), + )?; Ok(false) }