From 5bef2f4d860976d70e6a6560c1469445dbc501a8 Mon Sep 17 00:00:00 2001 From: nnethercott Date: Mon, 15 Sep 2025 16:10:56 +0200 Subject: [PATCH 1/4] Update arroy-hannoy conversion internals --- crates/milli/src/vector/store.rs | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/crates/milli/src/vector/store.rs b/crates/milli/src/vector/store.rs index d4d3f26cb..667d6b37d 100644 --- a/crates/milli/src/vector/store.rs +++ b/crates/milli/src/vector/store.rs @@ -1046,6 +1046,10 @@ impl VectorStore { where R: rand::Rng + rand::SeedableRng, { + // No work if distances are the same + if AD::name() == HD::name() { + return Ok(()); + } for index in vector_store_range_for_embedder(self.embedder_index) { let arroy_reader: arroy::Reader = match arroy::Reader::open(arroy_rtxn, index, self.database.remap_types()) { @@ -1084,6 +1088,10 @@ impl VectorStore { where R: rand::Rng + rand::SeedableRng, { + // No work if distances are the same + if AD::name() == HD::name() { + return Ok(()); + } for index in vector_store_range_for_embedder(self.embedder_index) { let hannoy_reader: hannoy::Reader = match hannoy::Reader::open(hannoy_rtxn, index, self.database.remap_types()) { @@ -1098,12 +1106,8 @@ impl VectorStore { arroy_writer.clear(arroy_wtxn)?; for entry in hannoy_reader.iter(hannoy_rtxn)? { let (item, mut vector) = entry?; - // hannoy bug? the `vector` here can be longer than `dimensions`. - // workaround: truncating. 
- if vector.len() > dimensions { - vector.truncate(dimensions); - } - // arroy and hannoy disagreement over the 0 value + debug_assert!(vector.len() == dimensions); + // arroy and hannoy disagreement over the 0 value if distance is Hamming // - arroy does: // - if x >= 0 => 1 // - if x < 0 => -1 From a47888f02cec8780cd2a08f098f79e72f6f46535 Mon Sep 17 00:00:00 2001 From: nnethercott Date: Tue, 16 Sep 2025 11:02:46 +0200 Subject: [PATCH 2/4] bump hannoy to 0.0.6 --- Cargo.lock | 14 ++++++++++++-- crates/milli/Cargo.toml | 2 +- crates/milli/src/vector/store.rs | 2 +- 3 files changed, 14 insertions(+), 4 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 64d80186b..985cfaa97 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2613,14 +2613,15 @@ dependencies = [ [[package]] name = "hannoy" -version = "0.0.5" +version = "0.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4b6a412d145918473a8257706599a1088c505047eef9cc6c63c494c95786044f" +checksum = "6079f3d1f94be72564b6c61bd565bdfc6458ce0f0e0f66c4fe4799ccc2b932d0" dependencies = [ "bytemuck", "byteorder", "hashbrown 0.15.5", "heed", + "madvise", "min-max-heap", "page_size", "papaya", @@ -3749,6 +3750,15 @@ version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "670fdfda89751bc4a84ac13eaa63e205cf0fd22b4c9a5fbfa085b63c1f1d3a30" +[[package]] +name = "madvise" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4e1e75c3c34c2b34cec9f127418cb35240c7ebee5de36a51437e6b382c161b86" +dependencies = [ + "libc", +] + [[package]] name = "manifest-dir-macros" version = "0.1.18" diff --git a/crates/milli/Cargo.toml b/crates/milli/Cargo.toml index a76deea67..7832ace33 100644 --- a/crates/milli/Cargo.toml +++ b/crates/milli/Cargo.toml @@ -88,7 +88,7 @@ rhai = { version = "1.22.2", features = [ "sync", ] } arroy = "0.6.3" -hannoy = "0.0.5" +hannoy = { version = "0.0.6", features = ["arroy"] } rand = "0.8.5" tracing = "0.1.41" 
ureq = { version = "2.12.1", features = ["json"] } diff --git a/crates/milli/src/vector/store.rs b/crates/milli/src/vector/store.rs index 667d6b37d..76bd6ecfe 100644 --- a/crates/milli/src/vector/store.rs +++ b/crates/milli/src/vector/store.rs @@ -705,7 +705,7 @@ impl VectorStore { &'a self, rtxn: &'a RoTxn<'a>, db: hannoy::Database, - ) -> impl Iterator, hannoy::Error>> + 'a { + ) -> impl Iterator, hannoy::Error>> + 'a { vector_store_range_for_embedder(self.embedder_index).filter_map(move |index| { match hannoy::Reader::open(rtxn, index, db) { Ok(reader) => match reader.is_empty(rtxn) { From f9ffb8ada5f58c7b791aaa4b971507ffb9be7ebf Mon Sep 17 00:00:00 2001 From: nnethercott Date: Tue, 16 Sep 2025 12:00:36 +0200 Subject: [PATCH 3/4] bump from hannoy 0.0.6 to 0.0.7 --- Cargo.lock | 4 ++-- crates/milli/Cargo.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 985cfaa97..914cf5259 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2613,9 +2613,9 @@ dependencies = [ [[package]] name = "hannoy" -version = "0.0.6" +version = "0.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6079f3d1f94be72564b6c61bd565bdfc6458ce0f0e0f66c4fe4799ccc2b932d0" +checksum = "da289f8777194676602b38559b92a0e36e714e06bb5f9fed1ef88da116b23811" dependencies = [ "bytemuck", "byteorder", diff --git a/crates/milli/Cargo.toml b/crates/milli/Cargo.toml index 7832ace33..696208283 100644 --- a/crates/milli/Cargo.toml +++ b/crates/milli/Cargo.toml @@ -88,7 +88,7 @@ rhai = { version = "1.22.2", features = [ "sync", ] } arroy = "0.6.3" -hannoy = { version = "0.0.6", features = ["arroy"] } +hannoy = { version = "0.0.7", features = ["arroy"] } rand = "0.8.5" tracing = "0.1.41" ureq = { version = "2.12.1", features = ["json"] } From 7a6cf30cb23b452ff9714d6a342af7a5d30bddc6 Mon Sep 17 00:00:00 2001 From: nnethercott Date: Thu, 18 Sep 2025 11:23:57 +0200 Subject: [PATCH 4/4] bump hannoy to 0.0.8 --- Cargo.lock | 4 ++-- 
crates/milli/Cargo.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 914cf5259..ec5e9a531 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2613,9 +2613,9 @@ dependencies = [ [[package]] name = "hannoy" -version = "0.0.7" +version = "0.0.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da289f8777194676602b38559b92a0e36e714e06bb5f9fed1ef88da116b23811" +checksum = "0dba13a271c49a119a97862ebf0a74131d879832868400d9fcd937b790058fdd" dependencies = [ "bytemuck", "byteorder", diff --git a/crates/milli/Cargo.toml b/crates/milli/Cargo.toml index 696208283..663ec7e6f 100644 --- a/crates/milli/Cargo.toml +++ b/crates/milli/Cargo.toml @@ -88,7 +88,7 @@ rhai = { version = "1.22.2", features = [ "sync", ] } arroy = "0.6.3" -hannoy = { version = "0.0.7", features = ["arroy"] } +hannoy = { version = "0.0.8", features = ["arroy"] } rand = "0.8.5" tracing = "0.1.41" ureq = { version = "2.12.1", features = ["json"] }