Use Hannoy instead of arroy

Author: Kerollmops
Date: 2025-07-21 11:42:46 +02:00
Committed by: Clément Renault
Parent: d4c88f28f3
Commit: 296f06582c

25 changed files with 380 additions and 356 deletions

Cargo.lock (generated)

@@ -442,28 +442,6 @@ version = "0.7.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50"
 
-[[package]]
-name = "arroy"
-version = "0.6.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "08e6111f351d004bd13e95ab540721272136fd3218b39d3ec95a2ea1c4e6a0a6"
-dependencies = [
- "bytemuck",
- "byteorder",
- "enum-iterator",
- "heed",
- "memmap2",
- "nohash",
- "ordered-float 4.6.0",
- "page_size",
- "rand 0.8.5",
- "rayon",
- "roaring",
- "tempfile",
- "thiserror 2.0.12",
- "tracing",
-]
-
 [[package]]
 name = "assert-json-diff"
 version = "2.0.2"

@@ -2600,6 +2578,31 @@ dependencies = [
 "rand_distr",
 ]
 
+[[package]]
+name = "hannoy"
+version = "0.0.1"
+source = "git+https://github.com/nnethercott/hannoy?tag=v0.0.1#d51750cd5612b6875375f5f4ad3928c87d55ee38"
+dependencies = [
+ "bytemuck",
+ "byteorder",
+ "enum-iterator",
+ "hashbrown 0.15.4",
+ "heed",
+ "memmap2",
+ "min-max-heap",
+ "nohash",
+ "ordered-float 5.0.0",
+ "page_size",
+ "papaya",
+ "rand 0.8.5",
+ "rayon",
+ "roaring",
+ "tempfile",
+ "thiserror 2.0.12",
+ "tinyvec",
+ "tracing",
+]
+
 [[package]]
 name = "hash32"
 version = "0.3.1"

@@ -3922,7 +3925,6 @@ name = "milli"
 version = "1.16.0"
 dependencies = [
  "allocator-api2 0.3.0",
- "arroy",
  "bbqueue",
  "big_s",
  "bimap",

@@ -3950,6 +3952,7 @@ dependencies = [
  "fxhash",
  "geoutils",
  "grenad",
+ "hannoy",
  "hashbrown 0.15.4",
  "heed",
  "hf-hub",

@@ -4019,6 +4022,12 @@ dependencies = [
  "unicase",
 ]
 
+[[package]]
+name = "min-max-heap"
+version = "1.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2687e6cf9c00f48e9284cf9fd15f2ef341d03cc7743abf9df4c5f07fdee50b18"
+
 [[package]]
 name = "minimal-lexical"
 version = "0.2.1"

@@ -4359,15 +4368,6 @@ dependencies = [
  "num-traits",
 ]
 
-[[package]]
-name = "ordered-float"
-version = "4.6.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7bb71e1b3fa6ca1c61f383464aaf2bb0e2f8e772a1f01d486832464de363b951"
-dependencies = [
- "num-traits",
-]
-
 [[package]]
 name = "ordered-float"
 version = "5.0.0"

@@ -4399,6 +4399,16 @@ dependencies = [
  "winapi",
 ]
 
+[[package]]
+name = "papaya"
+version = "0.2.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f92dd0b07c53a0a0c764db2ace8c541dc47320dad97c2200c2a637ab9dd2328f"
+dependencies = [
+ "equivalent",
+ "seize",
+]
+
 [[package]]
 name = "parking_lot"
 version = "0.12.4"

@@ -5450,6 +5460,16 @@ dependencies = [
  "time",
 ]
 
+[[package]]
+name = "seize"
+version = "0.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e4b8d813387d566f627f3ea1b914c068aac94c40ae27ec43f5f33bde65abefe7"
+dependencies = [
+ "libc",
+ "windows-sys 0.52.0",
+]
+
 [[package]]
 name = "semver"
 version = "1.0.26"

@@ -143,10 +143,10 @@ impl IndexStats {
     ///
     /// - rtxn: a RO transaction for the index, obtained from `Index::read_txn()`.
     pub fn new(index: &Index, rtxn: &RoTxn) -> milli::Result<Self> {
-        let arroy_stats = index.arroy_stats(rtxn)?;
+        let hannoy_stats = index.hannoy_stats(rtxn)?;
         Ok(IndexStats {
-            number_of_embeddings: Some(arroy_stats.number_of_embeddings),
-            number_of_embedded_documents: Some(arroy_stats.documents.len()),
+            number_of_embeddings: Some(hannoy_stats.number_of_embeddings),
+            number_of_embedded_documents: Some(hannoy_stats.documents.len()),
             documents_database_stats: index.documents_stats(rtxn)?.unwrap_or_default(),
             number_of_documents: None,
             database_size: index.on_disk_size()?,

@@ -320,7 +320,7 @@ async fn binary_quantize_clear_documents() {
     }
     "###);
 
-    // Make sure the arroy DB has been cleared
+    // Make sure the hannoy DB has been cleared
     let (documents, _code) =
         index.search_post(json!({ "hybrid": { "embedder": "manual" }, "vector": [1, 1, 1] })).await;
     snapshot!(documents, @r###"

@@ -683,7 +683,7 @@ async fn clear_documents() {
     }
     "###);
 
-    // Make sure the arroy DB has been cleared
+    // Make sure the hannoy DB has been cleared
     let (documents, _code) =
         index.search_post(json!({ "vector": [1, 1, 1], "hybrid": {"embedder": "manual"} })).await;
     snapshot!(documents, @r###"

@@ -236,7 +236,7 @@ async fn reset_embedder_documents() {
     }
     "###);
 
-    // Make sure the arroy DB has been cleared
+    // Make sure the hannoy DB has been cleared
     let (documents, _code) =
         index.search_post(json!({ "vector": [1, 1, 1], "hybrid": {"embedder": "default"} })).await;
     snapshot!(json_string!(documents), @r###"

@@ -142,8 +142,8 @@ enum Command {
 #[derive(Clone, ValueEnum)]
 enum IndexPart {
-    /// Will make the arroy index hot.
-    Arroy,
+    /// Will make the hannoy index hot.
+    Hannoy,
 }
 
 fn main() -> anyhow::Result<()> {

@@ -658,12 +658,12 @@ fn hair_dryer(
     let rtxn = index.read_txn()?;
     for part in index_parts {
         match part {
-            IndexPart::Arroy => {
+            IndexPart::Hannoy => {
                 let mut count = 0;
-                let total = index.vector_arroy.len(&rtxn)?;
-                eprintln!("Hair drying arroy for {uid}...");
+                let total = index.vector_hannoy.len(&rtxn)?;
+                eprintln!("Hair drying hannoy for {uid}...");
                 for (i, result) in index
-                    .vector_arroy
+                    .vector_hannoy
                     .remap_types::<Bytes, Bytes>()
                     .iter(&rtxn)?
                     .enumerate()

@@ -68,7 +68,7 @@ pub fn v1_10_to_v1_11(
             )
         })?;
         let index_read_database =
-            try_opening_poly_database(&index_env, &index_rtxn, db_name::VECTOR_ARROY)
+            try_opening_poly_database(&index_env, &index_rtxn, db_name::VECTOR_HANNOY)
                 .with_context(|| format!("while updating date format for index `{uid}`"))?;
 
         let mut index_wtxn = index_env.write_txn().with_context(|| {

@@ -79,15 +79,16 @@ pub fn v1_10_to_v1_11(
         })?;
 
         let index_write_database =
-            try_opening_poly_database(&index_env, &index_wtxn, db_name::VECTOR_ARROY)
+            try_opening_poly_database(&index_env, &index_wtxn, db_name::VECTOR_HANNOY)
                 .with_context(|| format!("while updating date format for index `{uid}`"))?;
 
-        meilisearch_types::milli::arroy::upgrade::cosine_from_0_4_to_0_5(
-            &index_rtxn,
-            index_read_database.remap_types(),
-            &mut index_wtxn,
-            index_write_database.remap_types(),
-        )?;
+        // meilisearch_types::milli::hannoy::upgrade::cosine_from_0_4_to_0_5(
+        //     &index_rtxn,
+        //     index_read_database.remap_types(),
+        //     &mut index_wtxn,
+        //     index_write_database.remap_types(),
+        // )?;
+        unimplemented!("Hannoy doesn't support upgrading");
 
         index_wtxn.commit()?;
     }

@@ -87,7 +87,7 @@ rhai = { version = "1.22.2", features = [
     "no_time",
     "sync",
 ] }
-arroy = "0.6.1"
+hannoy = { git = "https://github.com/nnethercott/hannoy", tag = "v0.0.1" }
 rand = "0.8.5"
 tracing = "0.1.41"
 ureq = { version = "2.12.1", features = ["json"] }

@@ -76,7 +76,7 @@ pub enum InternalError {
     #[error("Cannot upgrade to the following version: v{0}.{1}.{2}.")]
     CannotUpgradeToVersion(u32, u32, u32),
     #[error(transparent)]
-    ArroyError(#[from] arroy::Error),
+    HannoyError(#[from] hannoy::Error),
     #[error(transparent)]
     VectorEmbeddingError(#[from] crate::vector::Error),
 }

@@ -419,23 +419,24 @@ impl From<crate::vector::Error> for Error {
     }
 }
 
-impl From<arroy::Error> for Error {
-    fn from(value: arroy::Error) -> Self {
+impl From<hannoy::Error> for Error {
+    fn from(value: hannoy::Error) -> Self {
         match value {
-            arroy::Error::Heed(heed) => heed.into(),
-            arroy::Error::Io(io) => io.into(),
-            arroy::Error::InvalidVecDimension { expected, received } => {
+            hannoy::Error::Heed(heed) => heed.into(),
+            hannoy::Error::Io(io) => io.into(),
+            hannoy::Error::InvalidVecDimension { expected, received } => {
                 Error::UserError(UserError::InvalidVectorDimensions { expected, found: received })
             }
-            arroy::Error::BuildCancelled => Error::InternalError(InternalError::AbortedIndexation),
-            arroy::Error::DatabaseFull
-            | arroy::Error::InvalidItemAppend
-            | arroy::Error::UnmatchingDistance { .. }
-            | arroy::Error::NeedBuild(_)
-            | arroy::Error::MissingKey { .. }
-            | arroy::Error::MissingMetadata(_)
-            | arroy::Error::CannotDecodeKeyMode { .. } => {
-                Error::InternalError(InternalError::ArroyError(value))
+            hannoy::Error::BuildCancelled => Error::InternalError(InternalError::AbortedIndexation),
+            hannoy::Error::DatabaseFull
+            | hannoy::Error::InvalidItemAppend
+            | hannoy::Error::UnmatchingDistance { .. }
+            | hannoy::Error::NeedBuild(_)
+            | hannoy::Error::MissingKey { .. }
+            | hannoy::Error::MissingMetadata(_)
+            | hannoy::Error::UnknownVersion { .. }
+            | hannoy::Error::CannotDecodeKeyMode { .. } => {
+                Error::InternalError(InternalError::HannoyError(value))
             }
         }
     }
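
A minimal sketch of what this conversion buys call sites; the variant names come from the hunk above, the function itself is illustrative:

    // Because `From<hannoy::Error> for Error` exists, milli code can use `?`
    // on hannoy results and still return milli's `Error`.
    fn propagate() -> Result<(), Error> {
        let res: Result<(), hannoy::Error> = Err(hannoy::Error::BuildCancelled);
        res?; // converted to Error::InternalError(InternalError::AbortedIndexation)
        Ok(())
    }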

@@ -31,7 +31,7 @@ use crate::prompt::PromptData;
 use crate::proximity::ProximityPrecision;
 use crate::update::new::StdResult;
 use crate::vector::db::IndexEmbeddingConfigs;
-use crate::vector::{ArroyStats, ArroyWrapper, Embedding};
+use crate::vector::{Embedding, HannoyStats, HannoyWrapper};
 use crate::{
     default_criteria, CboRoaringBitmapCodec, Criterion, DocumentId, ExternalDocumentsIds,
     FacetDistribution, FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldIdWordCountCodec,

@@ -113,7 +113,7 @@ pub mod db_name {
     pub const FIELD_ID_DOCID_FACET_F64S: &str = "field-id-docid-facet-f64s";
     pub const FIELD_ID_DOCID_FACET_STRINGS: &str = "field-id-docid-facet-strings";
     pub const VECTOR_EMBEDDER_CATEGORY_ID: &str = "vector-embedder-category-id";
-    pub const VECTOR_ARROY: &str = "vector-arroy";
+    pub const VECTOR_HANNOY: &str = "vector-hannoy";
     pub const DOCUMENTS: &str = "documents";
 }
 const NUMBER_OF_DBS: u32 = 25;

@@ -177,10 +177,10 @@ pub struct Index {
     /// Maps the document id, the facet field id and the strings.
     pub field_id_docid_facet_strings: Database<FieldDocIdFacetStringCodec, Str>,
 
-    /// Maps an embedder name to its id in the arroy store.
+    /// Maps an embedder name to its id in the hannoy store.
     pub(crate) embedder_category_id: Database<Unspecified, Unspecified>,
-    /// Vector store based on arroy™.
-    pub vector_arroy: arroy::Database<Unspecified>,
+    /// Vector store based on hannoy™.
+    pub vector_hannoy: hannoy::Database<Unspecified>,
 
     /// Maps the document id to the document as an obkv store.
     pub(crate) documents: Database<BEU32, ObkvCodec>,

@@ -237,7 +237,7 @@ impl Index {
         // vector stuff
         let embedder_category_id =
             env.create_database(&mut wtxn, Some(VECTOR_EMBEDDER_CATEGORY_ID))?;
-        let vector_arroy = env.create_database(&mut wtxn, Some(VECTOR_ARROY))?;
+        let vector_hannoy = env.create_database(&mut wtxn, Some(VECTOR_HANNOY))?;
         let documents = env.create_database(&mut wtxn, Some(DOCUMENTS))?;

@@ -264,7 +264,7 @@ impl Index {
             facet_id_is_empty_docids,
             field_id_docid_facet_f64s,
             field_id_docid_facet_strings,
-            vector_arroy,
+            vector_hannoy,
             embedder_category_id,
             documents,
         };

@@ -1772,8 +1772,8 @@ impl Index {
         for config in embedders.embedding_configs(rtxn)? {
             let embedder_info = embedders.embedder_info(rtxn, &config.name)?.unwrap();
             let has_fragments = config.config.embedder_options.has_fragments();
-            let reader = ArroyWrapper::new(
-                self.vector_arroy,
+            let reader = HannoyWrapper::new(
+                self.vector_hannoy,
                 embedder_info.embedder_id,
                 config.config.quantized(),
             );

@@ -1792,13 +1792,13 @@ impl Index {
         Ok(PrefixSettings { compute_prefixes, max_prefix_length: 4, prefix_count_threshold: 100 })
     }
 
-    pub fn arroy_stats(&self, rtxn: &RoTxn<'_>) -> Result<ArroyStats> {
-        let mut stats = ArroyStats::default();
+    pub fn hannoy_stats(&self, rtxn: &RoTxn<'_>) -> Result<HannoyStats> {
+        let mut stats = HannoyStats::default();
         let embedding_configs = self.embedding_configs();
         for config in embedding_configs.embedding_configs(rtxn)? {
             let embedder_id = embedding_configs.embedder_id(rtxn, &config.name)?.unwrap();
             let reader =
-                ArroyWrapper::new(self.vector_arroy, embedder_id, config.config.quantized());
+                HannoyWrapper::new(self.vector_hannoy, embedder_id, config.config.quantized());
             reader.aggregate_stats(rtxn, &mut stats)?;
         }
         Ok(stats)

@@ -1842,7 +1842,7 @@ impl Index {
             facet_id_is_empty_docids,
             field_id_docid_facet_f64s,
             field_id_docid_facet_strings,
-            vector_arroy,
+            vector_hannoy,
             embedder_category_id,
             documents,
         } = self;

@@ -1913,7 +1913,7 @@ impl Index {
             "field_id_docid_facet_strings",
             field_id_docid_facet_strings.stat(rtxn).map(compute_size)?,
         );
-        sizes.insert("vector_arroy", vector_arroy.stat(rtxn).map(compute_size)?);
+        sizes.insert("vector_hannoy", vector_hannoy.stat(rtxn).map(compute_size)?);
         sizes.insert("embedder_category_id", embedder_category_id.stat(rtxn).map(compute_size)?);
         sizes.insert("documents", documents.stat(rtxn).map(compute_size)?);

@@ -53,7 +53,7 @@ pub use search::new::{
 };
 use serde_json::Value;
 pub use thread_pool_no_abort::{PanicCatched, ThreadPoolNoAbort, ThreadPoolNoAbortBuilder};
-pub use {arroy, charabia as tokenizer, heed, rhai};
+pub use {charabia as tokenizer, hannoy, heed, rhai};
 
 pub use self::asc_desc::{AscDesc, AscDescError, Member, SortError};
 pub use self::attribute_patterns::{AttributePatterns, PatternMatch};

@@ -98,12 +98,12 @@ impl Progress {
     }
 
     // TODO: ideally we should expose the progress in a way that let arroy use it directly
-    pub(crate) fn update_progress_from_arroy(&self, progress: arroy::WriterProgress) {
-        self.update_progress(progress.main);
-        if let Some(sub) = progress.sub {
-            self.update_progress(sub);
-        }
-    }
+    // pub(crate) fn update_progress_from_hannoy(&self, progress: hannoy::WriterProgress) {
+    //     self.update_progress(progress.main);
+    //     if let Some(sub) = progress.sub {
+    //         self.update_progress(sub);
+    //     }
+    // }
 }
 
 /// Generate the names associated with the durations and push them.

@@ -277,43 +277,43 @@ impl<U: Send + Sync + 'static> Step for VariableNameStep<U> {
     }
 }
 
-impl Step for arroy::MainStep {
-    fn name(&self) -> Cow<'static, str> {
-        match self {
-            arroy::MainStep::PreProcessingTheItems => "pre processing the items",
-            arroy::MainStep::WritingTheDescendantsAndMetadata => {
-                "writing the descendants and metadata"
-            }
-            arroy::MainStep::RetrieveTheUpdatedItems => "retrieve the updated items",
-            arroy::MainStep::RetrievingTheTreeAndItemNodes => "retrieving the tree and item nodes",
-            arroy::MainStep::UpdatingTheTrees => "updating the trees",
-            arroy::MainStep::CreateNewTrees => "create new trees",
-            arroy::MainStep::WritingNodesToDatabase => "writing nodes to database",
-            arroy::MainStep::DeleteExtraneousTrees => "delete extraneous trees",
-            arroy::MainStep::WriteTheMetadata => "write the metadata",
-        }
-        .into()
-    }
-    fn current(&self) -> u32 {
-        *self as u32
-    }
-    fn total(&self) -> u32 {
-        Self::CARDINALITY as u32
-    }
-}
-impl Step for arroy::SubStep {
-    fn name(&self) -> Cow<'static, str> {
-        self.unit.into()
-    }
-    fn current(&self) -> u32 {
-        self.current.load(Ordering::Relaxed)
-    }
-    fn total(&self) -> u32 {
-        self.max
-    }
-}
+// impl Step for hannoy::MainStep {
+//     fn name(&self) -> Cow<'static, str> {
+//         match self {
+//             hannoy::MainStep::PreProcessingTheItems => "pre processing the items",
+//             hannoy::MainStep::WritingTheDescendantsAndMetadata => {
+//                 "writing the descendants and metadata"
+//             }
+//             hannoy::MainStep::RetrieveTheUpdatedItems => "retrieve the updated items",
+//             hannoy::MainStep::RetrievingTheTreeAndItemNodes => "retrieving the tree and item nodes",
+//             hannoy::MainStep::UpdatingTheTrees => "updating the trees",
+//             hannoy::MainStep::CreateNewTrees => "create new trees",
+//             hannoy::MainStep::WritingNodesToDatabase => "writing nodes to database",
+//             hannoy::MainStep::DeleteExtraneousTrees => "delete extraneous trees",
+//             hannoy::MainStep::WriteTheMetadata => "write the metadata",
+//         }
+//         .into()
+//     }
+//     fn current(&self) -> u32 {
+//         *self as u32
+//     }
+//     fn total(&self) -> u32 {
+//         Self::CARDINALITY as u32
+//     }
+// }
+// impl Step for hannoy::SubStep {
+//     fn name(&self) -> Cow<'static, str> {
+//         self.unit.into()
+//     }
+//     fn current(&self) -> u32 {
+//         self.current.load(Ordering::Relaxed)
+//     }
+//     fn total(&self) -> u32 {
+//         self.max
+//     }
+// }

@@ -6,7 +6,7 @@ use roaring::RoaringBitmap;
 use super::ranking_rules::{RankingRule, RankingRuleOutput, RankingRuleQueryTrait};
 use super::VectorStoreStats;
 use crate::score_details::{self, ScoreDetails};
-use crate::vector::{ArroyWrapper, DistributionShift, Embedder};
+use crate::vector::{DistributionShift, Embedder, HannoyWrapper};
 use crate::{DocumentId, Result, SearchContext, SearchLogger};
 
 pub struct VectorSort<Q: RankingRuleQueryTrait> {

@@ -56,7 +56,8 @@ impl<Q: RankingRuleQueryTrait> VectorSort<Q> {
         let target = &self.target;
 
         let before = Instant::now();
-        let reader = ArroyWrapper::new(ctx.index.vector_arroy, self.embedder_index, self.quantized);
+        let reader =
+            HannoyWrapper::new(ctx.index.vector_hannoy, self.embedder_index, self.quantized);
         let results = reader.nns_by_vector(ctx.txn, target, self.limit, Some(vector_candidates))?;
         self.cached_sorted_docids = results.into_iter();
         *ctx.vector_store_stats.get_or_insert_default() += VectorStoreStats {
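
For reference, a condensed sketch of how the renamed reader is queried; the constructor and `nns_by_vector` call mirror the hunk above, and the surrounding variables are assumed to be in scope:

    let reader =
        HannoyWrapper::new(index.vector_hannoy, embedder_index, quantized);
    // Fetch up to `limit` nearest neighbours of `target`, optionally
    // restricted to the given candidate set.
    let results = reader.nns_by_vector(&rtxn, target, limit, Some(vector_candidates))?;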

@@ -3,7 +3,7 @@ use std::sync::Arc;
 use roaring::RoaringBitmap;
 
 use crate::score_details::{self, ScoreDetails};
-use crate::vector::{ArroyWrapper, Embedder};
+use crate::vector::{Embedder, HannoyWrapper};
 use crate::{filtered_universe, DocumentId, Filter, Index, Result, SearchResult};
 
 pub struct Similar<'a> {

@@ -72,7 +72,7 @@ impl<'a> Similar<'a> {
             crate::UserError::InvalidSimilarEmbedder(self.embedder_name.to_owned())
         })?;
 
-        let reader = ArroyWrapper::new(self.index.vector_arroy, embedder_index, self.quantized);
+        let reader = HannoyWrapper::new(self.index.vector_hannoy, embedder_index, self.quantized);
         let results = reader.nns_by_item(
             self.rtxn,
             self.id,

@@ -45,7 +45,7 @@ impl<'t, 'i> ClearDocuments<'t, 'i> {
             facet_id_is_empty_docids,
             field_id_docid_facet_f64s,
             field_id_docid_facet_strings,
-            vector_arroy,
+            vector_hannoy,
             embedder_category_id: _,
             documents,
         } = self.index;

@@ -88,7 +88,7 @@ impl<'t, 'i> ClearDocuments<'t, 'i> {
         field_id_docid_facet_f64s.clear(self.wtxn)?;
         field_id_docid_facet_strings.clear(self.wtxn)?;
 
         // vector
-        vector_arroy.clear(self.wtxn)?;
+        vector_hannoy.clear(self.wtxn)?;
 
         documents.clear(self.wtxn)?;

@@ -39,7 +39,7 @@ use crate::update::{
     IndexerConfig, UpdateIndexingStep, WordPrefixDocids, WordPrefixIntegerDocids, WordsPrefixesFst,
 };
 use crate::vector::db::EmbedderInfo;
-use crate::vector::{ArroyWrapper, RuntimeEmbedders};
+use crate::vector::{HannoyWrapper, RuntimeEmbedders};
 use crate::{CboRoaringBitmapCodec, Index, Result, UserError};
 
 static MERGED_DATABASE_COUNT: usize = 7;

@@ -494,7 +494,7 @@ where
                     },
                 )?;
                 let reader =
-                    ArroyWrapper::new(self.index.vector_arroy, index, action.was_quantized);
+                    HannoyWrapper::new(self.index.vector_hannoy, index, action.was_quantized);
                 let Some(dim) = reader.dimensions(self.wtxn)? else {
                     continue;
                 };

@@ -504,7 +504,7 @@ where
         for (embedder_name, dimension) in dimension {
             let wtxn = &mut *self.wtxn;
-            let vector_arroy = self.index.vector_arroy;
+            let vector_hannoy = self.index.vector_hannoy;
             let cancel = &self.should_abort;
 
             let embedder_index =

@@ -523,7 +523,7 @@ where
             let is_quantizing = embedder_config.is_some_and(|action| action.is_being_quantized);
 
             pool.install(|| {
-                let mut writer = ArroyWrapper::new(vector_arroy, embedder_index, was_quantized);
+                let mut writer = HannoyWrapper::new(vector_hannoy, embedder_index, was_quantized);
                 writer.build_and_quantize(
                     wtxn,
                     // In the settings we don't have any progress to share

@@ -32,7 +32,7 @@ use crate::update::settings::{InnerIndexSettings, InnerIndexSettingsDiff};
 use crate::update::{AvailableIds, UpdateIndexingStep};
 use crate::vector::parsed_vectors::{ExplicitVectors, VectorOrArrayOfVectors};
 use crate::vector::settings::{RemoveFragments, WriteBackToDocuments};
-use crate::vector::ArroyWrapper;
+use crate::vector::HannoyWrapper;
 use crate::{FieldDistribution, FieldId, FieldIdMapMissingEntry, Index, Result};
 
 pub struct TransformOutput {

@@ -834,15 +834,15 @@ impl<'a, 'i> Transform<'a, 'i> {
             None
         };
 
-        let readers: BTreeMap<&str, (ArroyWrapper, &RoaringBitmap)> = settings_diff
+        let readers: BTreeMap<&str, (HannoyWrapper, &RoaringBitmap)> = settings_diff
             .embedding_config_updates
             .iter()
             .filter_map(|(name, action)| {
                 if let Some(WriteBackToDocuments { embedder_id, user_provided }) =
                     action.write_back()
                 {
-                    let reader = ArroyWrapper::new(
-                        self.index.vector_arroy,
+                    let reader = HannoyWrapper::new(
+                        self.index.vector_hannoy,
                         *embedder_id,
                         action.was_quantized,
                     );

@@ -884,7 +884,7 @@ impl<'a, 'i> Transform<'a, 'i> {
                 let injected_vectors: std::result::Result<
                     serde_json::Map<String, serde_json::Value>,
-                    arroy::Error,
+                    hannoy::Error,
                 > = readers
                     .iter()
                     .filter_map(|(name, (reader, user_provided))| {

@@ -949,9 +949,9 @@ impl<'a, 'i> Transform<'a, 'i> {
             else {
                 continue;
             };
-            let arroy =
-                ArroyWrapper::new(self.index.vector_arroy, infos.embedder_id, was_quantized);
-            let Some(dimensions) = arroy.dimensions(wtxn)? else {
+            let hannoy =
+                HannoyWrapper::new(self.index.vector_hannoy, infos.embedder_id, was_quantized);
+            let Some(dimensions) = hannoy.dimensions(wtxn)? else {
                 continue;
             };
             for fragment_id in fragment_ids {

@@ -959,17 +959,17 @@ impl<'a, 'i> Transform<'a, 'i> {
 
                 if infos.embedding_status.user_provided_docids().is_empty() {
                     // no user provided: clear store
-                    arroy.clear_store(wtxn, *fragment_id, dimensions)?;
+                    hannoy.clear_store(wtxn, *fragment_id, dimensions)?;
                     continue;
                 }
 
                 // some user provided, remove only the ids that are not user provided
-                let to_delete = arroy.items_in_store(wtxn, *fragment_id, |items| {
+                let to_delete = hannoy.items_in_store(wtxn, *fragment_id, |items| {
                     items - infos.embedding_status.user_provided_docids()
                 })?;
 
                 for to_delete in to_delete {
-                    arroy.del_item_in_store(wtxn, to_delete, *fragment_id, dimensions)?;
+                    hannoy.del_item_in_store(wtxn, to_delete, *fragment_id, dimensions)?;
                 }
             }
         }

@@ -27,7 +27,7 @@ use crate::update::index_documents::helpers::{
 };
 use crate::update::settings::InnerIndexSettingsDiff;
 use crate::vector::db::{EmbeddingStatusDelta, IndexEmbeddingConfig};
-use crate::vector::ArroyWrapper;
+use crate::vector::HannoyWrapper;
 use crate::{
     lat_lng_to_xyz, CboRoaringBitmapCodec, DocumentId, FieldId, GeoPoint, Index, InternalError,
     Result, SerializationError, U8StrStrCodec,

@@ -677,7 +677,8 @@ pub(crate) fn write_typed_chunk_into_index(
                 .get(&embedder_name)
                 .is_some_and(|conf| conf.is_quantized);
             // FIXME: allow customizing distance
-            let writer = ArroyWrapper::new(index.vector_arroy, infos.embedder_id, binary_quantized);
+            let writer =
+                HannoyWrapper::new(index.vector_hannoy, infos.embedder_id, binary_quantized);
 
             // remove vectors for docids we want them removed
             let merger = remove_vectors_builder.build();

@@ -255,9 +255,9 @@ impl<'a> From<FrameGrantR<'a>> for FrameWithHeader<'a> {
 #[repr(u8)]
 pub enum EntryHeader {
     DbOperation(DbOperation),
-    ArroyDeleteVector(ArroyDeleteVector),
-    ArroySetVectors(ArroySetVectors),
-    ArroySetVector(ArroySetVector),
+    HannoyDeleteVector(HannoyDeleteVector),
+    HannoySetVectors(HannoySetVectors),
+    HannoySetVector(HannoySetVector),
 }
 
 impl EntryHeader {

@@ -268,9 +268,9 @@ impl EntryHeader {
     const fn variant_id(&self) -> u8 {
         match self {
             EntryHeader::DbOperation(_) => 0,
-            EntryHeader::ArroyDeleteVector(_) => 1,
-            EntryHeader::ArroySetVectors(_) => 2,
-            EntryHeader::ArroySetVector(_) => 3,
+            EntryHeader::HannoyDeleteVector(_) => 1,
+            EntryHeader::HannoySetVectors(_) => 2,
+            EntryHeader::HannoySetVector(_) => 3,
         }
     }

@@ -286,26 +286,26 @@ impl EntryHeader {
     }
 
     const fn total_delete_vector_size() -> usize {
-        Self::variant_size() + mem::size_of::<ArroyDeleteVector>()
+        Self::variant_size() + mem::size_of::<HannoyDeleteVector>()
     }
 
     /// The `dimensions` corresponds to the number of `f32` in the embedding.
     fn total_set_vectors_size(count: usize, dimensions: usize) -> usize {
         let embedding_size = dimensions * mem::size_of::<f32>();
-        Self::variant_size() + mem::size_of::<ArroySetVectors>() + embedding_size * count
+        Self::variant_size() + mem::size_of::<HannoySetVectors>() + embedding_size * count
     }
 
     fn total_set_vector_size(dimensions: usize) -> usize {
         let embedding_size = dimensions * mem::size_of::<f32>();
-        Self::variant_size() + mem::size_of::<ArroySetVector>() + embedding_size
+        Self::variant_size() + mem::size_of::<HannoySetVector>() + embedding_size
     }
 
     fn header_size(&self) -> usize {
         let payload_size = match self {
             EntryHeader::DbOperation(op) => mem::size_of_val(op),
-            EntryHeader::ArroyDeleteVector(adv) => mem::size_of_val(adv),
-            EntryHeader::ArroySetVectors(asvs) => mem::size_of_val(asvs),
-            EntryHeader::ArroySetVector(asv) => mem::size_of_val(asv),
+            EntryHeader::HannoyDeleteVector(adv) => mem::size_of_val(adv),
+            EntryHeader::HannoySetVectors(asvs) => mem::size_of_val(asvs),
+            EntryHeader::HannoySetVector(asv) => mem::size_of_val(asv),
         };
         Self::variant_size() + payload_size
     }

@@ -319,19 +319,19 @@ impl EntryHeader {
                 EntryHeader::DbOperation(header)
             }
             1 => {
-                let header_bytes = &remaining[..mem::size_of::<ArroyDeleteVector>()];
+                let header_bytes = &remaining[..mem::size_of::<HannoyDeleteVector>()];
                 let header = checked::pod_read_unaligned(header_bytes);
-                EntryHeader::ArroyDeleteVector(header)
+                EntryHeader::HannoyDeleteVector(header)
             }
             2 => {
-                let header_bytes = &remaining[..mem::size_of::<ArroySetVectors>()];
+                let header_bytes = &remaining[..mem::size_of::<HannoySetVectors>()];
                 let header = checked::pod_read_unaligned(header_bytes);
-                EntryHeader::ArroySetVectors(header)
+                EntryHeader::HannoySetVectors(header)
             }
             3 => {
-                let header_bytes = &remaining[..mem::size_of::<ArroySetVector>()];
+                let header_bytes = &remaining[..mem::size_of::<HannoySetVector>()];
                 let header = checked::pod_read_unaligned(header_bytes);
-                EntryHeader::ArroySetVector(header)
+                EntryHeader::HannoySetVector(header)
             }
             id => panic!("invalid variant id: {id}"),
         }

@@ -341,9 +341,9 @@ impl EntryHeader {
         let (first, remaining) = header_bytes.split_first_mut().unwrap();
         let payload_bytes = match self {
             EntryHeader::DbOperation(op) => bytemuck::bytes_of(op),
-            EntryHeader::ArroyDeleteVector(adv) => bytemuck::bytes_of(adv),
-            EntryHeader::ArroySetVectors(asvs) => bytemuck::bytes_of(asvs),
-            EntryHeader::ArroySetVector(asv) => bytemuck::bytes_of(asv),
+            EntryHeader::HannoyDeleteVector(adv) => bytemuck::bytes_of(adv),
+            EntryHeader::HannoySetVectors(asvs) => bytemuck::bytes_of(asvs),
+            EntryHeader::HannoySetVector(asv) => bytemuck::bytes_of(asv),
         };
         *first = self.variant_id();
         remaining.copy_from_slice(payload_bytes);

@@ -378,7 +378,7 @@ impl DbOperation {
 
 #[derive(Debug, Clone, Copy, NoUninit, CheckedBitPattern)]
 #[repr(transparent)]
-pub struct ArroyDeleteVector {
+pub struct HannoyDeleteVector {
     pub docid: DocumentId,
 }

@@ -386,13 +386,13 @@ pub struct ArroyDeleteVector {
 #[repr(C)]
 /// The embeddings are in the remaining space and represents
 /// non-aligned [f32] each with dimensions f32s.
-pub struct ArroySetVectors {
+pub struct HannoySetVectors {
     pub docid: DocumentId,
     pub embedder_id: u8,
     _padding: [u8; 3],
 }
 
-impl ArroySetVectors {
+impl HannoySetVectors {
     fn embeddings_bytes<'a>(frame: &'a FrameGrantR<'_>) -> &'a [u8] {
         let skip = EntryHeader::variant_size() + mem::size_of::<Self>();
         &frame[skip..]

@@ -416,14 +416,14 @@ impl ArroySetVectors {
 #[repr(C)]
 /// The embeddings are in the remaining space and represents
 /// non-aligned [f32] each with dimensions f32s.
-pub struct ArroySetVector {
+pub struct HannoySetVector {
     pub docid: DocumentId,
     pub embedder_id: u8,
     pub extractor_id: u8,
     _padding: [u8; 2],
 }
 
-impl ArroySetVector {
+impl HannoySetVector {
     fn embeddings_bytes<'a>(frame: &'a FrameGrantR<'_>) -> &'a [u8] {
         let skip = EntryHeader::variant_size() + mem::size_of::<Self>();
         &frame[skip..]
@@ -553,7 +553,7 @@ impl<'b> ExtractorBbqueueSender<'b> {
         let refcell = self.producers.get().unwrap();
         let mut producer = refcell.0.borrow_mut_or_yield();
 
-        let payload_header = EntryHeader::ArroyDeleteVector(ArroyDeleteVector { docid });
+        let payload_header = EntryHeader::HannoyDeleteVector(HannoyDeleteVector { docid });
         let total_length = EntryHeader::total_delete_vector_size();
         if total_length > max_grant {
             panic!("The entry is larger ({total_length} bytes) than the BBQueue max grant ({max_grant} bytes)");

@@ -589,8 +589,8 @@ impl<'b> ExtractorBbqueueSender<'b> {
         // to zero to allocate no extra space at all
         let dimensions = embeddings.first().map_or(0, |emb| emb.len());
 
-        let arroy_set_vector = ArroySetVectors { docid, embedder_id, _padding: [0; 3] };
-        let payload_header = EntryHeader::ArroySetVectors(arroy_set_vector);
+        let hannoy_set_vector = HannoySetVectors { docid, embedder_id, _padding: [0; 3] };
+        let payload_header = EntryHeader::HannoySetVectors(hannoy_set_vector);
         let total_length = EntryHeader::total_set_vectors_size(embeddings.len(), dimensions);
         if total_length > max_grant {
             let mut value_file = tempfile::tempfile().map(BufWriter::new)?;

@@ -650,9 +650,9 @@ impl<'b> ExtractorBbqueueSender<'b> {
         // to zero to allocate no extra space at all
         let dimensions = embedding.as_ref().map_or(0, |emb| emb.len());
 
-        let arroy_set_vector =
-            ArroySetVector { docid, embedder_id, extractor_id, _padding: [0; 2] };
-        let payload_header = EntryHeader::ArroySetVector(arroy_set_vector);
+        let hannoy_set_vector =
+            HannoySetVector { docid, embedder_id, extractor_id, _padding: [0; 2] };
+        let payload_header = EntryHeader::HannoySetVector(hannoy_set_vector);
         let total_length = EntryHeader::total_set_vector_size(dimensions);
         if total_length > max_grant {
             let mut value_file = tempfile::tempfile().map(BufWriter::new)?;

@@ -24,7 +24,7 @@ use crate::progress::{EmbedderStats, Progress};
 use crate::update::settings::SettingsDelta;
 use crate::update::GrenadParameters;
 use crate::vector::settings::{EmbedderAction, RemoveFragments, WriteBackToDocuments};
-use crate::vector::{ArroyWrapper, Embedder, RuntimeEmbedders};
+use crate::vector::{Embedder, HannoyWrapper, RuntimeEmbedders};
 use crate::{FieldsIdsMap, GlobalFieldsIdsMap, Index, InternalError, Result, ThreadPoolNoAbort};
 
 pub(crate) mod de;

@@ -66,7 +66,7 @@ where
     let mut bbbuffers = Vec::new();
     let finished_extraction = AtomicBool::new(false);
 
-    let arroy_memory = grenad_parameters.max_memory;
+    let hannoy_memory = grenad_parameters.max_memory;
 
     let (grenad_parameters, total_bbbuffer_capacity) =
         indexer_memory_settings(pool.current_num_threads(), grenad_parameters);

@@ -129,8 +129,8 @@ where
     let global_fields_ids_map = GlobalFieldsIdsMap::new(&new_fields_ids_map);
 
-    let vector_arroy = index.vector_arroy;
-    let arroy_writers: Result<HashMap<_, _>> = embedders
+    let vector_arroy = index.vector_hannoy;
+    let hannoy_writers: Result<HashMap<_, _>> = embedders
         .inner_as_ref()
         .iter()
         .map(|(embedder_name, runtime)| {

@@ -143,7 +143,7 @@ where
             })?;
 
             let dimensions = runtime.embedder.dimensions();
-            let writer = ArroyWrapper::new(vector_arroy, embedder_index, runtime.is_quantized);
+            let writer = HannoyWrapper::new(vector_arroy, embedder_index, runtime.is_quantized);
 
             Ok((
                 embedder_index,

@@ -152,10 +152,10 @@ where
         })
         .collect();
 
-    let mut arroy_writers = arroy_writers?;
+    let mut hannoy_writers = hannoy_writers?;
 
     let congestion =
-        write_to_db(writer_receiver, finished_extraction, index, wtxn, &arroy_writers)?;
+        write_to_db(writer_receiver, finished_extraction, index, wtxn, &hannoy_writers)?;
 
     indexing_context.progress.update_progress(IndexingStep::WaitingForExtractors);

@@ -169,8 +169,8 @@ where
             wtxn,
             indexing_context.progress,
             index_embeddings,
-            arroy_memory,
-            &mut arroy_writers,
+            hannoy_memory,
+            &mut hannoy_writers,
             None,
             &indexing_context.must_stop_processing,
         )

@@ -226,7 +226,7 @@ where
     let mut bbbuffers = Vec::new();
     let finished_extraction = AtomicBool::new(false);
 
-    let arroy_memory = grenad_parameters.max_memory;
+    let hannoy_memory = grenad_parameters.max_memory;
 
     let (grenad_parameters, total_bbbuffer_capacity) =
         indexer_memory_settings(pool.current_num_threads(), grenad_parameters);

@@ -283,7 +283,7 @@ where
     let new_embedders = settings_delta.new_embedders();
     let embedder_actions = settings_delta.embedder_actions();
     let index_embedder_category_ids = settings_delta.new_embedder_category_id();
-    let mut arroy_writers = arroy_writers_from_embedder_actions(
+    let mut hannoy_writers = hannoy_writers_from_embedder_actions(
         index,
         embedder_actions,
         new_embedders,

@@ -291,7 +291,7 @@ where
     )?;
 
     let congestion =
-        write_to_db(writer_receiver, finished_extraction, index, wtxn, &arroy_writers)?;
+        write_to_db(writer_receiver, finished_extraction, index, wtxn, &hannoy_writers)?;
 
     indexing_context.progress.update_progress(IndexingStep::WaitingForExtractors);

@@ -305,8 +305,8 @@ where
             wtxn,
             indexing_context.progress,
             index_embeddings,
-            arroy_memory,
-            &mut arroy_writers,
+            hannoy_memory,
+            &mut hannoy_writers,
             Some(embedder_actions),
             &indexing_context.must_stop_processing,
         )

@@ -336,13 +336,13 @@ where
     Ok(congestion)
 }
 
-fn arroy_writers_from_embedder_actions<'indexer>(
+fn hannoy_writers_from_embedder_actions<'indexer>(
     index: &Index,
     embedder_actions: &'indexer BTreeMap<String, EmbedderAction>,
     embedders: &'indexer RuntimeEmbedders,
     index_embedder_category_ids: &'indexer std::collections::HashMap<String, u8>,
-) -> Result<HashMap<u8, (&'indexer str, &'indexer Embedder, ArroyWrapper, usize)>> {
-    let vector_arroy = index.vector_arroy;
+) -> Result<HashMap<u8, (&'indexer str, &'indexer Embedder, HannoyWrapper, usize)>> {
+    let vector_arroy = index.vector_hannoy;
 
     embedders
         .inner_as_ref()

@@ -361,7 +361,7 @@ fn arroy_writers_from_embedder_actions<'indexer>(
                 )));
             };
             let writer =
-                ArroyWrapper::new(vector_arroy, embedder_category_id, action.was_quantized);
+                HannoyWrapper::new(vector_arroy, embedder_category_id, action.was_quantized);
             let dimensions = runtime.embedder.dimensions();
             Some(Ok((
                 embedder_category_id,

@@ -384,7 +384,7 @@ where
         let Some(WriteBackToDocuments { embedder_id, .. }) = action.write_back() else {
            continue;
         };
-        let reader = ArroyWrapper::new(index.vector_arroy, *embedder_id, action.was_quantized);
+        let reader = HannoyWrapper::new(index.vector_hannoy, *embedder_id, action.was_quantized);
         let Some(dimensions) = reader.dimensions(wtxn)? else {
             continue;
         };

@@ -400,7 +400,7 @@ where
         let Some(infos) = index.embedding_configs().embedder_info(wtxn, embedder_name)? else {
             continue;
         };
-        let arroy = ArroyWrapper::new(index.vector_arroy, infos.embedder_id, was_quantized);
+        let arroy = HannoyWrapper::new(index.vector_hannoy, infos.embedder_id, was_quantized);
         let Some(dimensions) = arroy.dimensions(wtxn)? else {
             continue;
         };

@@ -15,7 +15,7 @@ use crate::progress::Progress;
 use crate::update::settings::InnerIndexSettings;
 use crate::vector::db::IndexEmbeddingConfig;
 use crate::vector::settings::EmbedderAction;
-use crate::vector::{ArroyWrapper, Embedder, Embeddings, RuntimeEmbedders};
+use crate::vector::{Embedder, Embeddings, HannoyWrapper, RuntimeEmbedders};
 use crate::{Error, Index, InternalError, Result, UserError};
 
 pub fn write_to_db(

@@ -23,9 +23,9 @@ pub fn write_to_db(
     finished_extraction: &AtomicBool,
     index: &Index,
     wtxn: &mut RwTxn<'_>,
-    arroy_writers: &HashMap<u8, (&str, &Embedder, ArroyWrapper, usize)>,
+    hannoy_writers: &HashMap<u8, (&str, &Embedder, HannoyWrapper, usize)>,
 ) -> Result<ChannelCongestion> {
-    // Used by by the ArroySetVector to copy the embedding into an
+    // Used by by the HannoySetVector to copy the embedding into an
     // aligned memory area, required by arroy to accept a new vector.
     let mut aligned_embedding = Vec::new();
     let span = tracing::trace_span!(target: "indexing::write_db", "all");

@@ -56,7 +56,7 @@ pub fn write_to_db(
             ReceiverAction::LargeVectors(large_vectors) => {
                 let LargeVectors { docid, embedder_id, .. } = large_vectors;
                 let (_, _, writer, dimensions) =
-                    arroy_writers.get(&embedder_id).expect("requested a missing embedder");
+                    hannoy_writers.get(&embedder_id).expect("requested a missing embedder");
                 let mut embeddings = Embeddings::new(*dimensions);
                 for embedding in large_vectors.read_embeddings(*dimensions) {
                     embeddings.push(embedding.to_vec()).unwrap();

@@ -68,7 +68,7 @@ pub fn write_to_db(
                 large_vector @ LargeVector { docid, embedder_id, extractor_id, .. },
             ) => {
                 let (_, _, writer, dimensions) =
-                    arroy_writers.get(&embedder_id).expect("requested a missing embedder");
+                    hannoy_writers.get(&embedder_id).expect("requested a missing embedder");
                 let embedding = large_vector.read_embedding(*dimensions);
                 writer.add_item_in_store(wtxn, docid, extractor_id, embedding)?;
             }

@@ -80,12 +80,12 @@ pub fn write_to_db(
                 &mut writer_receiver,
                 index,
                 wtxn,
-                arroy_writers,
+                hannoy_writers,
                 &mut aligned_embedding,
             )?;
         }
 
-    write_from_bbqueue(&mut writer_receiver, index, wtxn, arroy_writers, &mut aligned_embedding)?;
+    write_from_bbqueue(&mut writer_receiver, index, wtxn, hannoy_writers, &mut aligned_embedding)?;
 
     Ok(ChannelCongestion {
         attempts: writer_receiver.sent_messages_attempts(),

@@ -115,8 +115,8 @@ pub fn build_vectors<MSP>(
     wtxn: &mut RwTxn<'_>,
     progress: &Progress,
     index_embeddings: Vec<IndexEmbeddingConfig>,
-    arroy_memory: Option<usize>,
-    arroy_writers: &mut HashMap<u8, (&str, &Embedder, ArroyWrapper, usize)>,
+    hannoy_memory: Option<usize>,
+    hannoy_writers: &mut HashMap<u8, (&str, &Embedder, HannoyWrapper, usize)>,
     embeder_actions: Option<&BTreeMap<String, EmbedderAction>>,
     must_stop_processing: &MSP,
 ) -> Result<()>

@@ -129,7 +129,7 @@ where
     let seed = rand::random();
     let mut rng = rand::rngs::StdRng::seed_from_u64(seed);
-    for (_index, (embedder_name, _embedder, writer, dimensions)) in arroy_writers {
+    for (_index, (embedder_name, _embedder, writer, dimensions)) in hannoy_writers {
         let dimensions = *dimensions;
         let is_being_quantized = embeder_actions
            .and_then(|actions| actions.get(*embedder_name).map(|action| action.is_being_quantized))

@@ -140,7 +140,7 @@ where
             &mut rng,
             dimensions,
             is_being_quantized,
-            arroy_memory,
+            hannoy_memory,
             must_stop_processing,
         )?;
     }

@@ -181,7 +181,7 @@ pub fn write_from_bbqueue(
     writer_receiver: &mut WriterBbqueueReceiver<'_>,
     index: &Index,
     wtxn: &mut RwTxn<'_>,
-    arroy_writers: &HashMap<u8, (&str, &crate::vector::Embedder, ArroyWrapper, usize)>,
+    hannoy_writers: &HashMap<u8, (&str, &crate::vector::Embedder, HannoyWrapper, usize)>,
     aligned_embedding: &mut Vec<f32>,
 ) -> crate::Result<()> {
     while let Some(frame_with_header) = writer_receiver.recv_frame() {

@@ -221,17 +221,17 @@ pub fn write_from_bbqueue(
                     },
                 }
             }
-            EntryHeader::ArroyDeleteVector(ArroyDeleteVector { docid }) => {
-                for (_index, (_name, _embedder, writer, dimensions)) in arroy_writers {
+            EntryHeader::HannoyDeleteVector(HannoyDeleteVector { docid }) => {
+                for (_index, (_name, _embedder, writer, dimensions)) in hannoy_writers {
                     let dimensions = *dimensions;
                     writer.del_items(wtxn, dimensions, docid)?;
                 }
             }
-            EntryHeader::ArroySetVectors(asvs) => {
-                let ArroySetVectors { docid, embedder_id, .. } = asvs;
+            EntryHeader::HannoySetVectors(asvs) => {
+                let HannoySetVectors { docid, embedder_id, .. } = asvs;
                 let frame = frame_with_header.frame();
                 let (_, _, writer, dimensions) =
-                    arroy_writers.get(&embedder_id).expect("requested a missing embedder");
+                    hannoy_writers.get(&embedder_id).expect("requested a missing embedder");
                 let mut embeddings = Embeddings::new(*dimensions);
                 let all_embeddings = asvs.read_all_embeddings_into_vec(frame, aligned_embedding);
                 writer.del_items(wtxn, *dimensions, docid)?;

@@ -245,12 +245,12 @@ pub fn write_from_bbqueue(
                     writer.add_items(wtxn, docid, &embeddings)?;
                 }
             }
-            EntryHeader::ArroySetVector(
-                asv @ ArroySetVector { docid, embedder_id, extractor_id, .. },
+            EntryHeader::HannoySetVector(
+                asv @ HannoySetVector { docid, embedder_id, extractor_id, .. },
             ) => {
                 let frame = frame_with_header.frame();
                 let (_, _, writer, dimensions) =
-                    arroy_writers.get(&embedder_id).expect("requested a missing embedder");
+                    hannoy_writers.get(&embedder_id).expect("requested a missing embedder");
                 let embedding = asv.read_all_embeddings_into_vec(frame, aligned_embedding);
 
                 if embedding.is_empty() {
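
The `aligned_embedding` scratch buffer exists because embeddings cross the queue as raw bytes with no alignment guarantee; an illustrative sketch of the copy it enables (not the exact helper used here):

    // Reinterpreting `&[u8]` as `&[f32]` is UB when the pointer is not
    // 4-byte aligned, so the bytes are copied into an aligned Vec<f32>.
    fn read_unaligned_f32s(bytes: &[u8], out: &mut Vec<f32>) {
        out.clear();
        out.extend(bytes.chunks_exact(4).map(|b| f32::from_ne_bytes([b[0], b[1], b[2], b[3]])));
    }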

@@ -14,7 +14,7 @@ use crate::constants::RESERVED_VECTORS_FIELD_NAME;
 use crate::documents::FieldIdMapper;
 use crate::vector::db::{EmbeddingStatus, IndexEmbeddingConfig};
 use crate::vector::parsed_vectors::{RawVectors, RawVectorsError, VectorOrArrayOfVectors};
-use crate::vector::{ArroyWrapper, Embedding, RuntimeEmbedders};
+use crate::vector::{Embedding, HannoyWrapper, RuntimeEmbedders};
 use crate::{DocumentId, Index, InternalError, Result, UserError};
 
 #[derive(Serialize)]

@@ -121,7 +121,7 @@ impl<'t> VectorDocumentFromDb<'t> {
         status: &EmbeddingStatus,
     ) -> Result<VectorEntry<'t>> {
         let reader =
-            ArroyWrapper::new(self.index.vector_arroy, embedder_id, config.config.quantized());
+            HannoyWrapper::new(self.index.vector_hannoy, embedder_id, config.config.quantized());
         let vectors = reader.item_vectors(self.rtxn, self.docid)?;
 
         Ok(VectorEntry {

@@ -149,7 +149,7 @@ impl<'t> VectorDocument<'t> for VectorDocumentFromDb<'t> {
                 name,
                 entry_from_raw_value(value, false).map_err(|_| {
                     InternalError::Serialization(crate::SerializationError::Decoding {
-                        db_name: Some(crate::index::db_name::VECTOR_ARROY),
+                        db_name: Some(crate::index::db_name::VECTOR_HANNOY),
                     })
                 })?,
             ))

@@ -167,7 +167,7 @@ impl<'t> VectorDocument<'t> for VectorDocumentFromDb<'t> {
             Some(embedding_from_doc) => {
                 Some(entry_from_raw_value(embedding_from_doc, false).map_err(|_| {
                     InternalError::Serialization(crate::SerializationError::Decoding {
-                        db_name: Some(crate::index::db_name::VECTOR_ARROY),
+                        db_name: Some(crate::index::db_name::VECTOR_HANNOY),
                     })
                 })?)
             }


@@ -1,4 +1,4 @@
-use arroy::distances::Cosine;
+use hannoy::distances::Cosine;
 use heed::RwTxn;
 
 use super::UpgradeIndex;
@@ -25,12 +25,13 @@ impl UpgradeIndex for Latest_V1_13_To_Latest_V1_14 {
         progress.update_progress(VectorStore::UpdateInternalVersions);
 
         let rtxn = index.read_txn()?;
-        arroy::upgrade::from_0_5_to_0_6::<Cosine>(
-            &rtxn,
-            index.vector_arroy.remap_data_type(),
-            wtxn,
-            index.vector_arroy.remap_data_type(),
-        )?;
+        // hannoy::upgrade::from_0_5_to_0_6::<Cosine>(
+        //     &rtxn,
+        //     index.vector_hannoy.remap_data_type(),
+        //     wtxn,
+        //     index.vector_hannoy.remap_data_type(),
+        // )?;
+        unimplemented!("upgrade hannoy");
 
         Ok(false)
     }
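
Note on this hunk: the arroy 0.5 to 0.6 in-place upgrade has no hannoy equivalent yet, so the v1.13 to v1.14 vector-store step now reaches `unimplemented!("upgrade hannoy")` and panics instead of migrating data. The commented-out block presumably keeps the old call shape for reference until a real hannoy migration lands in a follow-up.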


@@ -1,6 +1,6 @@
 use std::time::Instant;
 
-use arroy::Distance;
+use hannoy::Distance;
 
 use super::error::CompositeEmbedderContainsHuggingFace;
 use super::{
@@ -324,19 +324,18 @@ fn check_similarity(
     }
 
     for (left, right) in left.into_iter().zip(right) {
-        let left = arroy::internals::UnalignedVector::from_slice(&left);
-        let right = arroy::internals::UnalignedVector::from_slice(&right);
-        let left = arroy::internals::Leaf {
-            header: arroy::distances::Cosine::new_header(&left),
+        let left = hannoy::internals::UnalignedVector::from_slice(&left);
+        let right = hannoy::internals::UnalignedVector::from_slice(&right);
+        let left = hannoy::internals::Item {
+            header: hannoy::distances::Cosine::new_header(&left),
             vector: left,
         };
-        let right = arroy::internals::Leaf {
-            header: arroy::distances::Cosine::new_header(&right),
+        let right = hannoy::internals::Item {
+            header: hannoy::distances::Cosine::new_header(&right),
             vector: right,
         };
-        let distance = arroy::distances::Cosine::built_distance(&left, &right);
+        let distance = hannoy::distances::Cosine::distance(&left, &right);
 
         if distance > super::MAX_COMPOSITE_DISTANCE {
             return Err(NewEmbedderError::composite_embedding_value_mismatch(distance, hint));
        }
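
Two renames to note here: arroy's `Leaf` becomes hannoy's `Item`, and `built_distance` becomes `distance`. For reference, a minimal helper mirroring the new calls, assuming the hannoy v0.0.1 internals API exactly as this hunk uses it (the helper name is illustrative, not part of the commit):

use hannoy::distances::Cosine;
use hannoy::internals::{Item, UnalignedVector};
use hannoy::Distance;

// Cosine distance the way check_similarity now computes it: wrap both slices
// as UnalignedVector, attach a Cosine header, then call distance().
fn cosine_distance(a: &[f32], b: &[f32]) -> f32 {
    let a = UnalignedVector::from_slice(a);
    let b = UnalignedVector::from_slice(b);
    let a = Item { header: Cosine::new_header(&a), vector: a };
    let b = Item { header: Cosine::new_header(&b), vector: b };
    Cosine::distance(&a, &b)
}

Identical inputs should land near 0.0, comfortably under MAX_COMPOSITE_DISTANCE (0.01).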


@@ -3,9 +3,9 @@ use std::num::NonZeroUsize;
 use std::sync::{Arc, Mutex};
 use std::time::Instant;
 
-use arroy::distances::{BinaryQuantizedCosine, Cosine};
-use arroy::ItemId;
 use deserr::{DeserializeError, Deserr};
+use hannoy::distances::{BinaryQuantizedCosine, Cosine};
+use hannoy::ItemId;
 use heed::{RoTxn, RwTxn, Unspecified};
 use ordered_float::OrderedFloat;
 use roaring::RoaringBitmap;
@@ -41,15 +41,15 @@ pub type Embedding = Vec<f32>;
 pub const REQUEST_PARALLELISM: usize = 40;
 pub const MAX_COMPOSITE_DISTANCE: f32 = 0.01;
 
-pub struct ArroyWrapper {
+pub struct HannoyWrapper {
     quantized: bool,
     embedder_index: u8,
-    database: arroy::Database<Unspecified>,
+    database: hannoy::Database<Unspecified>,
 }
 
-impl ArroyWrapper {
+impl HannoyWrapper {
     pub fn new(
-        database: arroy::Database<Unspecified>,
+        database: hannoy::Database<Unspecified>,
         embedder_index: u8,
         quantized: bool,
     ) -> Self {
@@ -60,19 +60,19 @@ impl ArroyWrapper {
         self.embedder_index
     }
 
-    fn readers<'a, D: arroy::Distance>(
+    fn readers<'a, D: hannoy::Distance>(
         &'a self,
         rtxn: &'a RoTxn<'a>,
-        db: arroy::Database<D>,
-    ) -> impl Iterator<Item = Result<arroy::Reader<'a, D>, arroy::Error>> + 'a {
-        arroy_store_range_for_embedder(self.embedder_index).filter_map(move |index| {
-            match arroy::Reader::open(rtxn, index, db) {
+        db: hannoy::Database<D>,
+    ) -> impl Iterator<Item = Result<hannoy::Reader<'a, D>, hannoy::Error>> + 'a {
+        hannoy_store_range_for_embedder(self.embedder_index).filter_map(move |index| {
+            match hannoy::Reader::open(rtxn, index, db) {
                 Ok(reader) => match reader.is_empty(rtxn) {
                     Ok(false) => Some(Ok(reader)),
                     Ok(true) => None,
                     Err(e) => Some(Err(e)),
                 },
-                Err(arroy::Error::MissingMetadata(_)) => None,
+                Err(hannoy::Error::MissingMetadata(_)) => None,
                 Err(e) => Some(Err(e)),
             }
         })
@@ -86,7 +86,7 @@ impl ArroyWrapper {
         rtxn: &RoTxn,
         store_id: u8,
         with_items: F,
-    ) -> Result<O, arroy::Error>
+    ) -> Result<O, hannoy::Error>
     where
         F: FnOnce(&RoaringBitmap) -> O,
     {
@@ -97,26 +97,26 @@ impl ArroyWrapper {
         }
     }
 
-    fn _items_in_store<D: arroy::Distance, F, O>(
+    fn _items_in_store<D: hannoy::Distance, F, O>(
         &self,
         rtxn: &RoTxn,
-        db: arroy::Database<D>,
+        db: hannoy::Database<D>,
         store_id: u8,
         with_items: F,
-    ) -> Result<O, arroy::Error>
+    ) -> Result<O, hannoy::Error>
     where
         F: FnOnce(&RoaringBitmap) -> O,
     {
-        let index = arroy_store_for_embedder(self.embedder_index, store_id);
-        let reader = arroy::Reader::open(rtxn, index, db);
+        let index = hannoy_store_for_embedder(self.embedder_index, store_id);
+        let reader = hannoy::Reader::open(rtxn, index, db);
         match reader {
             Ok(reader) => Ok(with_items(reader.item_ids())),
-            Err(arroy::Error::MissingMetadata(_)) => Ok(with_items(&RoaringBitmap::new())),
+            Err(hannoy::Error::MissingMetadata(_)) => Ok(with_items(&RoaringBitmap::new())),
             Err(err) => Err(err),
         }
     }
 
-    pub fn dimensions(&self, rtxn: &RoTxn) -> Result<Option<usize>, arroy::Error> {
+    pub fn dimensions(&self, rtxn: &RoTxn) -> Result<Option<usize>, hannoy::Error> {
         if self.quantized {
             Ok(self
                 .readers(rtxn, self.quantized_db())
@@ -140,39 +140,41 @@ impl ArroyWrapper {
         rng: &mut R,
         dimension: usize,
         quantizing: bool,
-        arroy_memory: Option<usize>,
+        hannoy_memory: Option<usize>,
         cancel: &(impl Fn() -> bool + Sync + Send),
-    ) -> Result<(), arroy::Error> {
-        for index in arroy_store_range_for_embedder(self.embedder_index) {
+    ) -> Result<(), hannoy::Error> {
+        for index in hannoy_store_range_for_embedder(self.embedder_index) {
             if self.quantized {
-                let writer = arroy::Writer::new(self.quantized_db(), index, dimension);
+                let writer = hannoy::Writer::new(self.quantized_db(), index, dimension);
                 if writer.need_build(wtxn)? {
-                    writer.builder(rng).build(wtxn)?
+                    writer.builder(rng).ef_construction(48).build::<16, 32>(wtxn)?
                 } else if writer.is_empty(wtxn)? {
                     continue;
                 }
             } else {
-                let writer = arroy::Writer::new(self.angular_db(), index, dimension);
+                let writer = hannoy::Writer::new(self.angular_db(), index, dimension);
                 // If we are quantizing the databases, we can't know from meilisearch
                 // if the db was empty but still contained the wrong metadata, thus we need
                 // to quantize everything and can't stop early. Since this operation can
                 // only happen once in the life of an embedder, it's not very performance
                 // sensitive.
                 if quantizing && !self.quantized {
-                    let writer = writer.prepare_changing_distance::<BinaryQuantizedCosine>(wtxn)?;
-                    writer
-                        .builder(rng)
-                        .available_memory(arroy_memory.unwrap_or(usize::MAX))
-                        .progress(|step| progress.update_progress_from_arroy(step))
-                        .cancel(cancel)
-                        .build(wtxn)?;
+                    // let writer = writer.prepare_changing_distance::<BinaryQuantizedCosine>(wtxn)?;
+                    // writer
+                    //     .builder(rng)
+                    //     .available_memory(hannoy_memory.unwrap_or(usize::MAX))
+                    //     .progress(|step| progress.update_progress_from_hannoy(step))
+                    //     .cancel(cancel)
+                    //     .build(wtxn)?;
+                    unimplemented!("switching from quantized to non-quantized");
                 } else if writer.need_build(wtxn)? {
                     writer
                         .builder(rng)
-                        .available_memory(arroy_memory.unwrap_or(usize::MAX))
-                        .progress(|step| progress.update_progress_from_arroy(step))
-                        .cancel(cancel)
-                        .build(wtxn)?;
+                        .available_memory(hannoy_memory.unwrap_or(usize::MAX))
+                        // .progress(|step| progress.update_progress_from_hannoy(step))
+                        // .cancel(cancel)
+                        .ef_construction(48)
+                        .build::<16, 32>(wtxn)?;
                 } else if writer.is_empty(wtxn)? {
                     continue;
                 }
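
This hunk swaps arroy's tree build for an HNSW build. The two const generics on `build::<16, 32>` look like the graph degrees (M = 16 links per node on upper layers, M0 = 32 on the ground layer), and `ef_construction(48)` bounds how many candidate neighbors are explored per insertion; also note that progress reporting and cancellation are commented out for now, and the quantized-to-non-quantized switch panics. A minimal sketch of one store's build, assuming the hannoy v0.0.1 builder API exactly as used above (`build_one_store` and its signature are illustrative, not part of the commit):

use hannoy::distances::Cosine;
use heed::RwTxn;
use rand::rngs::StdRng;

// Build a single hannoy store the way the hunk above does: skip stores that
// need no build, otherwise construct the HNSW graph with M = 16, M0 = 32 and
// an ef_construction of 48.
fn build_one_store(
    db: hannoy::Database<Cosine>,
    index: u16,
    dimension: usize,
    wtxn: &mut RwTxn,
    rng: &mut StdRng,
) -> Result<(), hannoy::Error> {
    let writer = hannoy::Writer::new(db, index, dimension);
    if writer.need_build(wtxn)? {
        writer.builder(rng).ef_construction(48).build::<16, 32>(wtxn)?;
    }
    Ok(())
}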
@@ -188,18 +190,18 @@ impl ArroyWrapper {
     pub fn add_items(
         &self,
         wtxn: &mut RwTxn,
-        item_id: arroy::ItemId,
+        item_id: hannoy::ItemId,
         embeddings: &Embeddings<f32>,
-    ) -> Result<(), arroy::Error> {
+    ) -> Result<(), hannoy::Error> {
         let dimension = embeddings.dimension();
         for (index, vector) in
-            arroy_store_range_for_embedder(self.embedder_index).zip(embeddings.iter())
+            hannoy_store_range_for_embedder(self.embedder_index).zip(embeddings.iter())
         {
             if self.quantized {
-                arroy::Writer::new(self.quantized_db(), index, dimension)
+                hannoy::Writer::new(self.quantized_db(), index, dimension)
                     .add_item(wtxn, item_id, vector)?
             } else {
-                arroy::Writer::new(self.angular_db(), index, dimension)
+                hannoy::Writer::new(self.angular_db(), index, dimension)
                     .add_item(wtxn, item_id, vector)?
             }
         }
@@ -210,9 +212,9 @@ impl ArroyWrapper {
     pub fn add_item(
         &self,
         wtxn: &mut RwTxn,
-        item_id: arroy::ItemId,
+        item_id: hannoy::ItemId,
         vector: &[f32],
-    ) -> Result<(), arroy::Error> {
+    ) -> Result<(), hannoy::Error> {
         if self.quantized {
             self._add_item(wtxn, self.quantized_db(), item_id, vector)
         } else {
@@ -220,17 +222,17 @@ impl ArroyWrapper {
         }
     }
 
-    fn _add_item<D: arroy::Distance>(
+    fn _add_item<D: hannoy::Distance>(
         &self,
         wtxn: &mut RwTxn,
-        db: arroy::Database<D>,
-        item_id: arroy::ItemId,
+        db: hannoy::Database<D>,
+        item_id: hannoy::ItemId,
         vector: &[f32],
-    ) -> Result<(), arroy::Error> {
+    ) -> Result<(), hannoy::Error> {
         let dimension = vector.len();
-        for index in arroy_store_range_for_embedder(self.embedder_index) {
-            let writer = arroy::Writer::new(db, index, dimension);
+        for index in hannoy_store_range_for_embedder(self.embedder_index) {
+            let writer = hannoy::Writer::new(db, index, dimension);
             if !writer.contains_item(wtxn, item_id)? {
                 writer.add_item(wtxn, item_id, vector)?;
                 break;
@@ -245,10 +247,10 @@ impl ArroyWrapper {
     pub fn add_item_in_store(
         &self,
         wtxn: &mut RwTxn,
-        item_id: arroy::ItemId,
+        item_id: hannoy::ItemId,
         store_id: u8,
         vector: &[f32],
-    ) -> Result<(), arroy::Error> {
+    ) -> Result<(), hannoy::Error> {
         if self.quantized {
             self._add_item_in_store(wtxn, self.quantized_db(), item_id, store_id, vector)
         } else {
@@ -256,18 +258,18 @@ impl ArroyWrapper {
         }
     }
 
-    fn _add_item_in_store<D: arroy::Distance>(
+    fn _add_item_in_store<D: hannoy::Distance>(
         &self,
         wtxn: &mut RwTxn,
-        db: arroy::Database<D>,
-        item_id: arroy::ItemId,
+        db: hannoy::Database<D>,
+        item_id: hannoy::ItemId,
         store_id: u8,
         vector: &[f32],
-    ) -> Result<(), arroy::Error> {
+    ) -> Result<(), hannoy::Error> {
         let dimension = vector.len();
-        let index = arroy_store_for_embedder(self.embedder_index, store_id);
-        let writer = arroy::Writer::new(db, index, dimension);
+        let index = hannoy_store_for_embedder(self.embedder_index, store_id);
+        let writer = hannoy::Writer::new(db, index, dimension);
         writer.add_item(wtxn, item_id, vector)
     }
@@ -276,14 +278,14 @@ impl ArroyWrapper {
         &self,
         wtxn: &mut RwTxn,
         dimension: usize,
-        item_id: arroy::ItemId,
-    ) -> Result<(), arroy::Error> {
-        for index in arroy_store_range_for_embedder(self.embedder_index) {
+        item_id: hannoy::ItemId,
+    ) -> Result<(), hannoy::Error> {
+        for index in hannoy_store_range_for_embedder(self.embedder_index) {
             if self.quantized {
-                let writer = arroy::Writer::new(self.quantized_db(), index, dimension);
+                let writer = hannoy::Writer::new(self.quantized_db(), index, dimension);
                 writer.del_item(wtxn, item_id)?;
             } else {
-                let writer = arroy::Writer::new(self.angular_db(), index, dimension);
+                let writer = hannoy::Writer::new(self.angular_db(), index, dimension);
                 writer.del_item(wtxn, item_id)?;
             }
         }
@@ -301,10 +303,10 @@ impl ArroyWrapper {
     pub fn del_item_in_store(
         &self,
         wtxn: &mut RwTxn,
-        item_id: arroy::ItemId,
+        item_id: hannoy::ItemId,
         store_id: u8,
         dimensions: usize,
-    ) -> Result<bool, arroy::Error> {
+    ) -> Result<bool, hannoy::Error> {
         if self.quantized {
             self._del_item_in_store(wtxn, self.quantized_db(), item_id, store_id, dimensions)
         } else {
@@ -312,16 +314,16 @@ impl ArroyWrapper {
         }
     }
 
-    fn _del_item_in_store<D: arroy::Distance>(
+    fn _del_item_in_store<D: hannoy::Distance>(
         &self,
         wtxn: &mut RwTxn,
-        db: arroy::Database<D>,
-        item_id: arroy::ItemId,
+        db: hannoy::Database<D>,
+        item_id: hannoy::ItemId,
         store_id: u8,
         dimensions: usize,
-    ) -> Result<bool, arroy::Error> {
-        let index = arroy_store_for_embedder(self.embedder_index, store_id);
-        let writer = arroy::Writer::new(db, index, dimensions);
+    ) -> Result<bool, hannoy::Error> {
+        let index = hannoy_store_for_embedder(self.embedder_index, store_id);
+        let writer = hannoy::Writer::new(db, index, dimensions);
         writer.del_item(wtxn, item_id)
     }
@@ -335,7 +337,7 @@ impl ArroyWrapper {
         wtxn: &mut RwTxn,
         store_id: u8,
         dimensions: usize,
-    ) -> Result<(), arroy::Error> {
+    ) -> Result<(), hannoy::Error> {
         if self.quantized {
             self._clear_store(wtxn, self.quantized_db(), store_id, dimensions)
         } else {
@@ -343,15 +345,15 @@ impl ArroyWrapper {
         }
     }
 
-    fn _clear_store<D: arroy::Distance>(
+    fn _clear_store<D: hannoy::Distance>(
         &self,
         wtxn: &mut RwTxn,
-        db: arroy::Database<D>,
+        db: hannoy::Database<D>,
         store_id: u8,
         dimensions: usize,
-    ) -> Result<(), arroy::Error> {
-        let index = arroy_store_for_embedder(self.embedder_index, store_id);
-        let writer = arroy::Writer::new(db, index, dimensions);
+    ) -> Result<(), hannoy::Error> {
+        let index = hannoy_store_for_embedder(self.embedder_index, store_id);
+        let writer = hannoy::Writer::new(db, index, dimensions);
         writer.clear(wtxn)
     }
@@ -359,9 +361,9 @@ impl ArroyWrapper {
     pub fn del_item(
         &self,
         wtxn: &mut RwTxn,
-        item_id: arroy::ItemId,
+        item_id: hannoy::ItemId,
         vector: &[f32],
-    ) -> Result<bool, arroy::Error> {
+    ) -> Result<bool, hannoy::Error> {
         if self.quantized {
             self._del_item(wtxn, self.quantized_db(), item_id, vector)
         } else {
@@ -369,37 +371,34 @@ impl ArroyWrapper {
         }
     }
 
-    fn _del_item<D: arroy::Distance>(
+    fn _del_item<D: hannoy::Distance>(
         &self,
         wtxn: &mut RwTxn,
-        db: arroy::Database<D>,
-        item_id: arroy::ItemId,
+        db: hannoy::Database<D>,
+        item_id: hannoy::ItemId,
         vector: &[f32],
-    ) -> Result<bool, arroy::Error> {
+    ) -> Result<bool, hannoy::Error> {
         let dimension = vector.len();
 
-        for index in arroy_store_range_for_embedder(self.embedder_index) {
-            let writer = arroy::Writer::new(db, index, dimension);
-            let Some(candidate) = writer.item_vector(wtxn, item_id)? else {
-                continue;
-            };
-            if candidate == vector {
+        for index in hannoy_store_range_for_embedder(self.embedder_index) {
+            let writer = hannoy::Writer::new(db, index, dimension);
+            if writer.contains_item(wtxn, item_id)? {
                 return writer.del_item(wtxn, item_id);
             }
         }
         Ok(false)
     }
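
Note that `_del_item` changes behavior here, not just names: the arroy version read the stored vector back and deleted the item only if it matched the `vector` argument, while the hannoy version deletes as soon as the item exists in a store and ignores `vector` entirely. If hannoy cannot cheaply read a stored vector back, this is the pragmatic choice, but any caller relying on the old compare-before-delete guard would now see different results.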
-    pub fn clear(&self, wtxn: &mut RwTxn, dimension: usize) -> Result<(), arroy::Error> {
-        for index in arroy_store_range_for_embedder(self.embedder_index) {
+    pub fn clear(&self, wtxn: &mut RwTxn, dimension: usize) -> Result<(), hannoy::Error> {
+        for index in hannoy_store_range_for_embedder(self.embedder_index) {
             if self.quantized {
-                let writer = arroy::Writer::new(self.quantized_db(), index, dimension);
+                let writer = hannoy::Writer::new(self.quantized_db(), index, dimension);
                 if writer.is_empty(wtxn)? {
                     continue;
                 }
                 writer.clear(wtxn)?;
             } else {
-                let writer = arroy::Writer::new(self.angular_db(), index, dimension);
+                let writer = hannoy::Writer::new(self.angular_db(), index, dimension);
                 if writer.is_empty(wtxn)? {
                     continue;
                 }
@@ -413,17 +412,17 @@ impl ArroyWrapper {
         &self,
         rtxn: &RoTxn,
         dimension: usize,
-        item: arroy::ItemId,
-    ) -> Result<bool, arroy::Error> {
-        for index in arroy_store_range_for_embedder(self.embedder_index) {
+        item: hannoy::ItemId,
+    ) -> Result<bool, hannoy::Error> {
+        for index in hannoy_store_range_for_embedder(self.embedder_index) {
             let contains = if self.quantized {
-                let writer = arroy::Writer::new(self.quantized_db(), index, dimension);
+                let writer = hannoy::Writer::new(self.quantized_db(), index, dimension);
                 if writer.is_empty(rtxn)? {
                     continue;
                 }
                 writer.contains_item(rtxn, item)?
             } else {
-                let writer = arroy::Writer::new(self.angular_db(), index, dimension);
+                let writer = hannoy::Writer::new(self.angular_db(), index, dimension);
                 if writer.is_empty(rtxn)? {
                     continue;
                 }
@@ -442,7 +441,7 @@ impl ArroyWrapper {
         item: ItemId,
         limit: usize,
         filter: Option<&RoaringBitmap>,
-    ) -> Result<Vec<(ItemId, f32)>, arroy::Error> {
+    ) -> Result<Vec<(ItemId, f32)>, hannoy::Error> {
         if self.quantized {
             self._nns_by_item(rtxn, self.quantized_db(), item, limit, filter)
         } else {
@@ -450,19 +449,19 @@ impl ArroyWrapper {
         }
     }
 
-    fn _nns_by_item<D: arroy::Distance>(
+    fn _nns_by_item<D: hannoy::Distance>(
         &self,
         rtxn: &RoTxn,
-        db: arroy::Database<D>,
+        db: hannoy::Database<D>,
         item: ItemId,
         limit: usize,
         filter: Option<&RoaringBitmap>,
-    ) -> Result<Vec<(ItemId, f32)>, arroy::Error> {
+    ) -> Result<Vec<(ItemId, f32)>, hannoy::Error> {
         let mut results = Vec::new();
 
         for reader in self.readers(rtxn, db) {
             let reader = reader?;
-            let mut searcher = reader.nns(limit);
+            let mut searcher = reader.nns(limit, limit * 2); // TODO find better ef
             if let Some(filter) = filter {
                 if reader.item_ids().is_disjoint(filter) {
                     continue;
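
hannoy's `Reader::nns` takes a second argument on top of the result count: the search-time candidate-queue size (ef_search in HNSW terms). It must be at least the result count, and raising it explores more of the graph, trading latency for recall; the commit hard-codes `limit * 2` and leaves a TODO to tune it. A purely illustrative sketch of that constraint (`choose_ef` is not part of the commit or of hannoy):

// Clamp a requested ef_search so it never drops below the result count,
// mirroring the constraint the two-argument nns(count, ef) implies.
// The 2x default matches the heuristic hard-coded above.
fn choose_ef(limit: usize, requested: Option<usize>) -> usize {
    requested.unwrap_or(limit * 2).max(limit)
}

fn main() {
    assert_eq!(choose_ef(10, None), 20);
    assert_eq!(choose_ef(10, Some(5)), 10); // never below the result count
}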
@@ -484,7 +483,7 @@ impl ArroyWrapper {
         vector: &[f32],
         limit: usize,
         filter: Option<&RoaringBitmap>,
-    ) -> Result<Vec<(ItemId, f32)>, arroy::Error> {
+    ) -> Result<Vec<(ItemId, f32)>, hannoy::Error> {
         if self.quantized {
             self._nns_by_vector(rtxn, self.quantized_db(), vector, limit, filter)
         } else {
@@ -492,19 +491,19 @@ impl ArroyWrapper {
         }
     }
 
-    fn _nns_by_vector<D: arroy::Distance>(
+    fn _nns_by_vector<D: hannoy::Distance>(
         &self,
         rtxn: &RoTxn,
-        db: arroy::Database<D>,
+        db: hannoy::Database<D>,
         vector: &[f32],
         limit: usize,
         filter: Option<&RoaringBitmap>,
-    ) -> Result<Vec<(ItemId, f32)>, arroy::Error> {
+    ) -> Result<Vec<(ItemId, f32)>, hannoy::Error> {
         let mut results = Vec::new();
 
         for reader in self.readers(rtxn, db) {
             let reader = reader?;
-            let mut searcher = reader.nns(limit);
+            let mut searcher = reader.nns(limit, limit * 2); // TODO find better ef
             if let Some(filter) = filter {
                 if reader.item_ids().is_disjoint(filter) {
                     continue;
@@ -520,7 +519,7 @@ impl ArroyWrapper {
         Ok(results)
     }
 
-    pub fn item_vectors(&self, rtxn: &RoTxn, item_id: u32) -> Result<Vec<Vec<f32>>, arroy::Error> {
+    pub fn item_vectors(&self, rtxn: &RoTxn, item_id: u32) -> Result<Vec<Vec<f32>>, hannoy::Error> {
         let mut vectors = Vec::new();
 
         if self.quantized {
@@ -539,19 +538,19 @@ impl ArroyWrapper {
         Ok(vectors)
     }
 
-    fn angular_db(&self) -> arroy::Database<Cosine> {
+    fn angular_db(&self) -> hannoy::Database<Cosine> {
         self.database.remap_data_type()
     }
 
-    fn quantized_db(&self) -> arroy::Database<BinaryQuantizedCosine> {
+    fn quantized_db(&self) -> hannoy::Database<BinaryQuantizedCosine> {
         self.database.remap_data_type()
     }
     pub fn aggregate_stats(
         &self,
         rtxn: &RoTxn,
-        stats: &mut ArroyStats,
-    ) -> Result<(), arroy::Error> {
+        stats: &mut HannoyStats,
+    ) -> Result<(), hannoy::Error> {
         if self.quantized {
             for reader in self.readers(rtxn, self.quantized_db()) {
                 let reader = reader?;
@@ -579,10 +578,11 @@ impl ArroyWrapper {
     }
 
 #[derive(Debug, Default, Clone)]
-pub struct ArroyStats {
+pub struct HannoyStats {
     pub number_of_embeddings: u64,
     pub documents: RoaringBitmap,
 }
 
 /// One or multiple embeddings stored consecutively in a flat vector.
 #[derive(Debug, PartialEq)]
 pub struct Embeddings<F> {
@@ -1227,11 +1227,11 @@ pub const fn is_cuda_enabled() -> bool {
     cfg!(feature = "cuda")
 }
 
-fn arroy_store_range_for_embedder(embedder_id: u8) -> impl Iterator<Item = u16> {
-    (0..=u8::MAX).map(move |store_id| arroy_store_for_embedder(embedder_id, store_id))
+fn hannoy_store_range_for_embedder(embedder_id: u8) -> impl Iterator<Item = u16> {
+    (0..=u8::MAX).map(move |store_id| hannoy_store_for_embedder(embedder_id, store_id))
 }
 
-fn arroy_store_for_embedder(embedder_id: u8, store_id: u8) -> u16 {
+fn hannoy_store_for_embedder(embedder_id: u8, store_id: u8) -> u16 {
     let embedder_id = (embedder_id as u16) << 8;
     embedder_id | (store_id as u16)
 }
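
These last two helpers only change names; the store layout is untouched. Each embedder still owns a contiguous block of 256 u16 store indexes, with the embedder id in the high byte and the store id in the low byte, so on-disk store numbering survives the arroy-to-hannoy swap. A standalone sanity check of the packing (the assertions are illustrative, not part of the commit):

fn hannoy_store_for_embedder(embedder_id: u8, store_id: u8) -> u16 {
    let embedder_id = (embedder_id as u16) << 8;
    embedder_id | (store_id as u16)
}

fn main() {
    // Embedder 3 owns stores 0x0300..=0x03FF, 256 stores in total.
    assert_eq!(hannoy_store_for_embedder(3, 0x00), 0x0300);
    assert_eq!(hannoy_store_for_embedder(3, 0xFF), 0x03FF);
    assert_eq!((0x0300u16..=0x03FF).count(), 256);
}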