Reintroduce arroy and support for dumpless upgrade from previous versions

Author: Clément Renault, 2025-07-29 18:00:29 +02:00
Committer: Louis Dureuil
Parent: db9f205184
Commit: 3bc192ae52
16 changed files with 94 additions and 39 deletions

Cargo.lock (generated): 32 changed lines

@@ -442,6 +442,28 @@ version = "0.7.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50"
 
+[[package]]
+name = "arroy"
+version = "0.6.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "08e6111f351d004bd13e95ab540721272136fd3218b39d3ec95a2ea1c4e6a0a6"
+dependencies = [
+ "bytemuck",
+ "byteorder",
+ "enum-iterator",
+ "heed",
+ "memmap2",
+ "nohash",
+ "ordered-float 4.6.0",
+ "page_size",
+ "rand 0.8.5",
+ "rayon",
+ "roaring",
+ "tempfile",
+ "thiserror 2.0.12",
+ "tracing",
+]
+
 [[package]]
 name = "assert-json-diff"
 version = "2.0.2"
@@ -3927,6 +3949,7 @@ name = "milli"
 version = "1.18.0"
 dependencies = [
  "allocator-api2 0.3.0",
+ "arroy",
  "bbqueue",
  "big_s",
  "bimap",
@@ -4382,6 +4405,15 @@
 "num-traits",
 ]
 
+[[package]]
+name = "ordered-float"
+version = "4.6.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7bb71e1b3fa6ca1c61f383464aaf2bb0e2f8e772a1f01d486832464de363b951"
+dependencies = [
+ "num-traits",
+]
+
 [[package]]
 name = "ordered-float"
 version = "5.0.0"

---- next file ----

@@ -660,10 +660,10 @@ fn hair_dryer(
         match part {
             IndexPart::Hannoy => {
                 let mut count = 0;
-                let total = index.vector_hannoy.len(&rtxn)?;
+                let total = index.vector_store.len(&rtxn)?;
                 eprintln!("Hair drying hannoy for {uid}...");
                 for (i, result) in index
-                    .vector_hannoy
+                    .vector_store
                     .remap_types::<Bytes, Bytes>()
                     .iter(&rtxn)?
                     .enumerate()
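Judging by the surrounding code, `hair_dryer` walks the whole database to pre-fault its pages into the OS cache, and this rename simply points it at the new `vector_store` handle. A condensed sketch of the same pattern; `index` and `rtxn` stand in for the surrounding bindings, and the byte counting is illustrative rather than meilitool's exact output:

```rust
use heed::types::Bytes;

// Iterate the vector store as raw bytes: merely reading every key and
// value forces each backing LMDB page into memory.
let mut touched = 0usize;
for (i, result) in index
    .vector_store
    .remap_types::<Bytes, Bytes>()
    .iter(&rtxn)?
    .enumerate()
{
    let (key, value) = result?;
    touched += key.len() + value.len();
    if i % 10_000 == 0 {
        eprintln!("touched {i} entries ({touched} bytes so far)");
    }
}
```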

---- next file ----

@@ -82,13 +82,12 @@ pub fn v1_10_to_v1_11(
         try_opening_poly_database(&index_env, &index_wtxn, db_name::VECTOR_STORE)
             .with_context(|| format!("while updating date format for index `{uid}`"))?;
 
-    // meilisearch_types::milli::hannoy::upgrade::cosine_from_0_4_to_0_5(
-    //     &index_rtxn,
-    //     index_read_database.remap_types(),
-    //     &mut index_wtxn,
-    //     index_write_database.remap_types(),
-    // )?;
-    unimplemented!("Hannoy doesn't support upgrading");
+    meilisearch_types::milli::arroy::upgrade::cosine_from_0_4_to_0_5(
+        &index_rtxn,
+        index_read_database.remap_types(),
+        &mut index_wtxn,
+        index_write_database.remap_types(),
+    )?;
 
     index_wtxn.commit()?;
 }
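The same poly database intentionally appears as both `index_read_database` and `index_write_database`: heed's `remap_types` is a zero-cost retyping of a single underlying LMDB database, so the arroy helper can decode the old v0.4 layout through one view while encoding the v0.5 layout through the other. A minimal sketch of that mechanism, assuming only heed; the `Bytes` views stand in for whatever codecs the upgrade helper actually pins:

```rust
use heed::types::{Bytes, Unspecified};
use heed::Database;

// One database handle, two typed views. `remap_types` changes only how
// keys and values are (de)serialized, never which pages are touched.
fn two_views(db: Database<Unspecified, Unspecified>) {
    let read_view: Database<Bytes, Bytes> = db.remap_types();
    let write_view: Database<Bytes, Bytes> = db.remap_types();
    let _ = (read_view, write_view);
}
```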

---- next file ----

@@ -87,6 +87,7 @@ rhai = { version = "1.22.2", features = [
     "no_time",
     "sync",
 ] }
+arroy = "0.6.1"
 hannoy = { git = "https://github.com/nnethercott/hannoy", tag = "v0.0.1" }
 rand = "0.8.5"
 tracing = "0.1.41"

---- next file ----

@@ -76,6 +76,8 @@ pub enum InternalError {
     #[error("Cannot upgrade to the following version: v{0}.{1}.{2}.")]
     CannotUpgradeToVersion(u32, u32, u32),
     #[error(transparent)]
+    ArroyError(#[from] arroy::Error),
+    #[error(transparent)]
     HannoyError(#[from] hannoy::Error),
     #[error(transparent)]
     VectorEmbeddingError(#[from] crate::vector::Error),
@@ -419,6 +421,28 @@ impl From<crate::vector::Error> for Error {
     }
 }
 
+impl From<arroy::Error> for Error {
+    fn from(value: arroy::Error) -> Self {
+        match value {
+            arroy::Error::Heed(heed) => heed.into(),
+            arroy::Error::Io(io) => io.into(),
+            arroy::Error::InvalidVecDimension { expected, received } => {
+                Error::UserError(UserError::InvalidVectorDimensions { expected, found: received })
+            }
+            arroy::Error::BuildCancelled => Error::InternalError(InternalError::AbortedIndexation),
+            arroy::Error::DatabaseFull
+            | arroy::Error::InvalidItemAppend
+            | arroy::Error::UnmatchingDistance { .. }
+            | arroy::Error::NeedBuild(_)
+            | arroy::Error::MissingKey { .. }
+            | arroy::Error::MissingMetadata(_)
+            | arroy::Error::CannotDecodeKeyMode { .. } => {
+                Error::InternalError(InternalError::ArroyError(value))
+            }
+        }
+    }
+}
+
 impl From<hannoy::Error> for Error {
     fn from(value: hannoy::Error) -> Self {
         match value {
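A minimal sketch of what this conversion buys call sites: any function returning milli's `Result` can apply `?` directly to an arroy operation, and the `From` impl above decides whether the failure surfaces as a user error (mismatched vector dimensions), an aborted indexation, or an internal `ArroyError`. The helper below is hypothetical, for illustration only:

```rust
// `?` routes the error through `From<arroy::Error> for Error` exactly as
// the match above describes; callers never handle arroy errors directly.
fn forward(res: Result<(), arroy::Error>) -> crate::Result<()> {
    Ok(res?)
}
```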

---- next file ----

@@ -180,7 +180,7 @@ pub struct Index {
     /// Maps an embedder name to its id in the hannoy store.
     pub(crate) embedder_category_id: Database<Unspecified, Unspecified>,
     /// Vector store based on hannoy™.
-    pub vector_hannoy: hannoy::Database<Unspecified>,
+    pub vector_store: hannoy::Database<Unspecified>,
 
     /// Maps the document id to the document as an obkv store.
     pub(crate) documents: Database<BEU32, ObkvCodec>,
@@ -264,7 +264,7 @@ impl Index {
             facet_id_is_empty_docids,
             field_id_docid_facet_f64s,
             field_id_docid_facet_strings,
-            vector_hannoy,
+            vector_store: vector_hannoy,
             embedder_category_id,
             documents,
         };
@@ -1773,7 +1773,7 @@ impl Index {
             let embedder_info = embedders.embedder_info(rtxn, &config.name)?.unwrap();
             let has_fragments = config.config.embedder_options.has_fragments();
             let reader = VectorStore::new(
-                self.vector_hannoy,
+                self.vector_store,
                 embedder_info.embedder_id,
                 config.config.quantized(),
             );
@@ -1798,7 +1798,7 @@ impl Index {
         for config in embedding_configs.embedding_configs(rtxn)? {
             let embedder_id = embedding_configs.embedder_id(rtxn, &config.name)?.unwrap();
             let reader =
-                VectorStore::new(self.vector_hannoy, embedder_id, config.config.quantized());
+                VectorStore::new(self.vector_store, embedder_id, config.config.quantized());
             reader.aggregate_stats(rtxn, &mut stats)?;
         }
         Ok(stats)
@@ -1842,7 +1842,7 @@ impl Index {
             facet_id_is_empty_docids,
             field_id_docid_facet_f64s,
             field_id_docid_facet_strings,
-            vector_hannoy,
+            vector_store: vector_hannoy,
             embedder_category_id,
             documents,
         } = self;
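The `vector_store: vector_hannoy` lines are ordinary struct-field binding, not a second database: only the public field was renamed, while the locals in the constructor and the exhaustive destructurings keep their old `vector_hannoy` names, so the field-init shorthand no longer applies. A tiny standalone illustration (all names hypothetical):

```rust
struct Store {
    vector_store: u32,
}

fn demo(vector_hannoy: u32) -> u32 {
    // Initializer: the renamed field binds the old local name.
    let store = Store { vector_store: vector_hannoy };
    // Pattern: destructure the new field back into the old local name.
    let Store { vector_store: vector_hannoy } = store;
    vector_hannoy
}
```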

---- next file ----

@@ -53,7 +53,7 @@ pub use search::new::{
 };
 use serde_json::Value;
 pub use thread_pool_no_abort::{PanicCatched, ThreadPoolNoAbort, ThreadPoolNoAbortBuilder};
-pub use {charabia as tokenizer, hannoy, heed, rhai};
+pub use {arroy, charabia as tokenizer, hannoy, heed, rhai};
 
 pub use self::asc_desc::{AscDesc, AscDescError, Member, SortError};
 pub use self::attribute_patterns::{AttributePatterns, PatternMatch};

---- next file ----

@@ -56,7 +56,7 @@ impl<Q: RankingRuleQueryTrait> VectorSort<Q> {
         let target = &self.target;
 
         let before = Instant::now();
-        let reader = VectorStore::new(ctx.index.vector_hannoy, self.embedder_index, self.quantized);
+        let reader = VectorStore::new(ctx.index.vector_store, self.embedder_index, self.quantized);
         let results = reader.nns_by_vector(ctx.txn, target, self.limit, Some(vector_candidates))?;
         self.cached_sorted_docids = results.into_iter();
         *ctx.vector_store_stats.get_or_insert_default() += VectorStoreStats {
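This hunk and the similar-documents hunk below share one read path: wrap the shared `vector_store` database in a per-embedder `VectorStore` view, then run an approximate nearest-neighbor query restricted to the candidate documents. A condensed sketch using only calls visible in this diff; every binding is a placeholder for the surrounding code's:

```rust
// `index`, `txn`, `embedder_index`, `quantized`, `target`, `limit`, and
// `vector_candidates` all stand in for bindings from the surrounding code.
let reader = VectorStore::new(index.vector_store, embedder_index, quantized);
// Nearest neighbors of `target`, at most `limit` results, drawn only
// from the `vector_candidates` bitmap.
let results = reader.nns_by_vector(txn, target, limit, Some(vector_candidates))?;
```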

---- next file ----

@@ -72,7 +72,7 @@ impl<'a> Similar<'a> {
                 crate::UserError::InvalidSimilarEmbedder(self.embedder_name.to_owned())
             })?;
 
-        let reader = VectorStore::new(self.index.vector_hannoy, embedder_index, self.quantized);
+        let reader = VectorStore::new(self.index.vector_store, embedder_index, self.quantized);
         let results = reader.nns_by_item(
             self.rtxn,
             self.id,

---- next file ----

@@ -2,7 +2,8 @@ use heed::RwTxn;
 use roaring::RoaringBitmap;
 use time::OffsetDateTime;
 
-use crate::{database_stats::DatabaseStats, FieldDistribution, Index, Result};
+use crate::database_stats::DatabaseStats;
+use crate::{FieldDistribution, Index, Result};
 
 pub struct ClearDocuments<'t, 'i> {
     wtxn: &'t mut RwTxn<'i>,
@@ -45,7 +46,7 @@ impl<'t, 'i> ClearDocuments<'t, 'i> {
             facet_id_is_empty_docids,
             field_id_docid_facet_f64s,
             field_id_docid_facet_strings,
-            vector_hannoy,
+            vector_store,
             embedder_category_id: _,
             documents,
         } = self.index;
@@ -88,7 +89,7 @@ impl<'t, 'i> ClearDocuments<'t, 'i> {
         field_id_docid_facet_f64s.clear(self.wtxn)?;
         field_id_docid_facet_strings.clear(self.wtxn)?;
 
         // vector
-        vector_hannoy.clear(self.wtxn)?;
+        vector_store.clear(self.wtxn)?;
 
         documents.clear(self.wtxn)?;
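As in the `Index` hunks above, the destructuring here is deliberately exhaustive: every field of the index is listed, with `_` for the ones `ClearDocuments` ignores, so adding a database to `Index` becomes a compile error until this clear site is updated. A tiny illustration of the idea (types hypothetical):

```rust
struct AllDbs {
    a: u32,
    b: u32,
}

fn clear(dbs: &AllDbs) {
    // Exhaustive pattern: adding a field to `AllDbs` breaks this line,
    // forcing a decision about whether the new field must be cleared too.
    let AllDbs { a, b } = dbs;
    let _ = (a, b);
}
```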

---- next file ----

@@ -493,8 +493,7 @@ where
                         key: None,
                     },
                 )?;
-                let reader =
-                    VectorStore::new(self.index.vector_hannoy, index, action.was_quantized);
+                let reader = VectorStore::new(self.index.vector_store, index, action.was_quantized);
                 let Some(dim) = reader.dimensions(self.wtxn)? else {
                     continue;
                 };
@@ -504,7 +503,7 @@
 
             for (embedder_name, dimension) in dimension {
                 let wtxn = &mut *self.wtxn;
-                let vector_hannoy = self.index.vector_hannoy;
+                let vector_hannoy = self.index.vector_store;
                 let cancel = &self.should_abort;
                 let embedder_index =

---- next file ----

@@ -842,7 +842,7 @@ impl<'a, 'i> Transform<'a, 'i> {
                     action.write_back()
                 {
                     let reader = VectorStore::new(
-                        self.index.vector_hannoy,
+                        self.index.vector_store,
                         *embedder_id,
                         action.was_quantized,
                     );
@@ -950,7 +950,7 @@ impl<'a, 'i> Transform<'a, 'i> {
                 continue;
             };
             let hannoy =
-                VectorStore::new(self.index.vector_hannoy, infos.embedder_id, was_quantized);
+                VectorStore::new(self.index.vector_store, infos.embedder_id, was_quantized);
             let Some(dimensions) = hannoy.dimensions(wtxn)? else {
                 continue;
             };

---- next file ----

@@ -677,7 +677,7 @@ pub(crate) fn write_typed_chunk_into_index(
                 .get(&embedder_name)
                 .is_some_and(|conf| conf.is_quantized);
             // FIXME: allow customizing distance
-            let writer = VectorStore::new(index.vector_hannoy, infos.embedder_id, binary_quantized);
+            let writer = VectorStore::new(index.vector_store, infos.embedder_id, binary_quantized);
 
             // remove vectors for docids we want them removed
             let merger = remove_vectors_builder.build();

---- next file ----

@@ -130,7 +130,7 @@ where
 
     let global_fields_ids_map = GlobalFieldsIdsMap::new(&new_fields_ids_map);
 
-    let vector_arroy = index.vector_hannoy;
+    let vector_arroy = index.vector_store;
     let hannoy_writers: Result<HashMap<_, _>> = embedders
         .inner_as_ref()
         .iter()
@@ -343,7 +343,7 @@ fn hannoy_writers_from_embedder_actions<'indexer>(
     embedders: &'indexer RuntimeEmbedders,
     index_embedder_category_ids: &'indexer std::collections::HashMap<String, u8>,
 ) -> Result<HashMap<u8, (&'indexer str, &'indexer Embedder, VectorStore, usize)>> {
-    let vector_arroy = index.vector_hannoy;
+    let vector_arroy = index.vector_store;
 
     embedders
         .inner_as_ref()
@@ -385,7 +385,7 @@ where
         let Some(WriteBackToDocuments { embedder_id, .. }) = action.write_back() else {
             continue;
         };
-        let reader = VectorStore::new(index.vector_hannoy, *embedder_id, action.was_quantized);
+        let reader = VectorStore::new(index.vector_store, *embedder_id, action.was_quantized);
         let Some(dimensions) = reader.dimensions(wtxn)? else {
             continue;
         };
@@ -401,7 +401,7 @@ where
         let Some(infos) = index.embedding_configs().embedder_info(wtxn, embedder_name)? else {
             continue;
         };
-        let arroy = VectorStore::new(index.vector_hannoy, infos.embedder_id, was_quantized);
+        let arroy = VectorStore::new(index.vector_store, infos.embedder_id, was_quantized);
         let Some(dimensions) = arroy.dimensions(wtxn)? else {
             continue;
         };

---- next file ----

@@ -121,7 +121,7 @@ impl<'t> VectorDocumentFromDb<'t> {
         status: &EmbeddingStatus,
     ) -> Result<VectorEntry<'t>> {
         let reader =
-            VectorStore::new(self.index.vector_hannoy, embedder_id, config.config.quantized());
+            VectorStore::new(self.index.vector_store, embedder_id, config.config.quantized());
         let vectors = reader.item_vectors(self.rtxn, self.docid)?;
 
         Ok(VectorEntry {

---- next file ----

@@ -1,4 +1,4 @@
-use hannoy::distances::Cosine;
+use arroy::distances::Cosine;
 use heed::RwTxn;
 
 use super::UpgradeIndex;
@@ -25,13 +25,12 @@ impl UpgradeIndex for Latest_V1_13_To_Latest_V1_14 {
         progress.update_progress(VectorStore::UpdateInternalVersions);
 
         let rtxn = index.read_txn()?;
-        // hannoy::upgrade::from_0_5_to_0_6::<Cosine>(
-        //     &rtxn,
-        //     index.vector_hannoy.remap_data_type(),
-        //     wtxn,
-        //     index.vector_hannoy.remap_data_type(),
-        // )?;
-        unimplemented!("upgrade hannoy");
+        arroy::upgrade::from_0_5_to_0_6::<Cosine>(
+            &rtxn,
+            index.vector_store.remap_types(),
+            wtxn,
+            index.vector_store.remap_types(),
+        )?;
 
         Ok(false)
     }
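Like the v0.4 to v0.5 step earlier, this upgrade is monomorphized over a distance type (`Cosine` here) and runs in place: both `remap_types()` calls retype the same `index.vector_store` handle, once as the read view and once as the write view. An annotated sketch of the re-enabled body; the exact `UpgradeIndex` signature is not shown in this diff, and the meaning of the returned flag is an assumption:

```rust
let rtxn = index.read_txn()?;
arroy::upgrade::from_0_5_to_0_6::<Cosine>(
    &rtxn,
    index.vector_store.remap_types(), // read view of the 0.5 layout
    wtxn,
    index.vector_store.remap_types(), // write view for the 0.6 layout
)?;
// Presumably signals that this step requires no document reindexing.
Ok(false)
```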