Integrate the hannoy progress

This commit is contained in:
Clément Renault
2025-08-11 18:05:17 +02:00
committed by Louis Dureuil
parent 5c464e9855
commit ca5dc1b032
6 changed files with 51 additions and 24 deletions

18
Cargo.lock generated
View File

@ -2604,7 +2604,7 @@ dependencies = [
[[package]]
name = "hannoy"
version = "0.0.2"
source = "git+https://github.com/nnethercott/hannoy?branch=main#93a24c4cdf712152c90d27a2898715f22942c35c"
source = "git+https://github.com/nnethercott/hannoy?branch=main#8d1846b188ed2cc8776fdb86805eefbfbde9ddd1"
dependencies = [
"bytemuck",
"byteorder",
@ -2617,6 +2617,7 @@ dependencies = [
"rayon",
"roaring",
"rustc-hash 2.1.1",
"steppe",
"thiserror 2.0.12",
"tinyvec",
"tracing",
@ -3056,9 +3057,9 @@ dependencies = [
[[package]]
name = "indexmap"
version = "2.9.0"
version = "2.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cea70ddb795996207ad57735b50c5982d8844f38ba9ee5f1aedcfb708a2aa11e"
checksum = "fe4cd85333e22411419a0bcae1297d25e58c9443848b11dc6a86fefe8c78a661"
dependencies = [
"equivalent",
"hashbrown 0.15.4",
@ -4005,6 +4006,7 @@ dependencies = [
"smallstr",
"smallvec",
"smartstring",
"steppe",
"tempfile",
"thiserror 2.0.12",
"thread_local",
@ -5866,6 +5868,16 @@ version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f"
[[package]]
name = "steppe"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dead99cdf718f37bcd1d22dda9b498f35c5aa22894b755bfd94bf8c2daec9427"
dependencies = [
"convert_case 0.8.0",
"indexmap",
]
[[package]]
name = "strsim"
version = "0.10.0"

View File

@ -96,6 +96,7 @@ url = "2.5.4"
hashbrown = "0.15.4"
bumpalo = "3.18.1"
bumparaw-collections = "0.1.4"
steppe = { version = "0.4.0", default-features = false }
thread_local = "1.1.9"
allocator-api2 = "0.3.0"
rustc-hash = "2.1.1"

View File

@ -96,14 +96,6 @@ impl Progress {
durations.drain(..).map(|(name, duration)| (name, format!("{duration:.2?}"))).collect()
}
// TODO: ideally we should expose the progress in a way that let arroy use it directly
// pub(crate) fn update_progress_from_hannoy(&self, progress: hannoy::WriterProgress) {
// self.update_progress(progress.main);
// if let Some(sub) = progress.sub {
// self.update_progress(sub);
// }
// }
}
/// Generate the names associated with the durations and push them.
@ -317,3 +309,27 @@ impl<U: Send + Sync + 'static> Step for VariableNameStep<U> {
// self.max
// }
// }
// Integration with steppe
/// Lets a [`Progress`] be handed to APIs that report through `steppe::Progress`.
impl steppe::Progress for Progress {
    /// Wraps the incoming `steppe` step in [`Compat`] and forwards it to
    /// `update_progress`.
    fn update(&self, sub_progress: impl steppe::Step) {
        let adapted = Compat(sub_progress);
        self.update_progress(adapted);
    }
}
/// Newtype around a `steppe::Step` value, used to implement this module's
/// `Step` trait on top of it.
struct Compat<T>(T)
where
    T: steppe::Step;
/// Exposes the wrapped `steppe::Step` through this module's `Step` trait.
impl<T> Step for Compat<T>
where
    T: steppe::Step,
{
    /// Name of the wrapped step, converted into a `Cow<'static, str>`.
    fn name(&self) -> Cow<'static, str> {
        let inner_name = self.0.name();
        inner_name.into()
    }

    /// Current value of the wrapped step, falling back to `u32::MAX` when the
    /// conversion to `u32` fails.
    fn current(&self) -> u32 {
        u32::try_from(self.0.current()).unwrap_or(u32::MAX)
    }

    /// Total value of the wrapped step, falling back to `u32::MAX` when the
    /// conversion to `u32` fails.
    fn total(&self) -> u32 {
        u32::try_from(self.0.total()).unwrap_or(u32::MAX)
    }
}

View File

@ -526,7 +526,7 @@ where
writer.build_and_quantize(
wtxn,
// In the settings we don't have any progress to share
&Progress::default(),
Progress::default(),
&mut rng,
dimension,
is_quantizing,

View File

@ -136,7 +136,7 @@ where
.unwrap_or(false);
writer.build_and_quantize(
wtxn,
progress,
progress.clone(),
&mut rng,
dimensions,
is_being_quantized,

View File

@ -140,7 +140,7 @@ impl VectorStore {
pub fn build_and_quantize<R: rand::Rng + rand::SeedableRng>(
&mut self,
wtxn: &mut RwTxn,
progress: &Progress,
progress: Progress,
rng: &mut R,
dimension: usize,
quantizing: bool,
@ -151,12 +151,12 @@ impl VectorStore {
if self.quantized {
let writer = hannoy::Writer::new(self.quantized_db(), index, dimension);
if writer.need_build(wtxn)? {
writer
.builder(rng)
// .progress(|step| progress.update_progress_from_hannoy(step))
let mut builder = writer.builder(rng).progress(progress.clone());
builder
.available_memory(hannoy_memory.unwrap_or(usize::MAX))
.cancel(cancel)
.ef_construction(HANNOY_EF_CONSTRUCTION)
.build::<HANNOY_M, HANNOY_M0>(wtxn)?
.build::<HANNOY_M, HANNOY_M0>(wtxn)?;
} else if writer.is_empty(wtxn)? {
continue;
}
@ -169,18 +169,16 @@ impl VectorStore {
// sensitive.
if quantizing && !self.quantized {
let writer = writer.prepare_changing_distance::<Hamming>(wtxn)?;
writer
.builder(rng)
let mut builder = writer.builder(rng).progress(progress.clone());
builder
.available_memory(hannoy_memory.unwrap_or(usize::MAX))
// .progress(|step| progress.update_progress_from_hannoy(step))
.cancel(cancel)
.ef_construction(HANNOY_EF_CONSTRUCTION)
.build::<HANNOY_M, HANNOY_M0>(wtxn)?;
} else if writer.need_build(wtxn)? {
writer
.builder(rng)
let mut builder = writer.builder(rng).progress(progress.clone());
builder
.available_memory(hannoy_memory.unwrap_or(usize::MAX))
// .progress(|step| progress.update_progress_from_hannoy(step))
.cancel(cancel)
.ef_construction(HANNOY_EF_CONSTRUCTION)
.build::<HANNOY_M, HANNOY_M0>(wtxn)?;