First version of Hannoy dumpless upgrade

This commit is contained in:
Clément Renault
2025-08-07 15:55:46 +02:00
parent 6a3089ad83
commit 417f124be7
4 changed files with 114 additions and 11 deletions

2
Cargo.lock generated
View File

@ -2603,7 +2603,7 @@ dependencies = [
[[package]]
name = "hannoy"
version = "0.0.2"
source = "git+https://github.com/nnethercott/hannoy?branch=main#93a24c4cdf712152c90d27a2898715f22942c35c"
source = "git+https://github.com/nnethercott/hannoy?branch=main#d7097b5214c211f5d2bb9d2643f3d9fb8ccb03e2"
dependencies = [
"bytemuck",
"byteorder",

View File

@ -3,15 +3,18 @@ mod v1_13;
mod v1_14;
mod v1_15;
mod v1_16;
mod v1_18;
use heed::RwTxn;
use v1_12::{V1_12_3_To_V1_13_0, V1_12_To_V1_12_3};
use v1_13::{V1_13_0_To_V1_13_1, V1_13_1_To_Latest_V1_13};
use v1_14::Latest_V1_13_To_Latest_V1_14;
use v1_15::Latest_V1_14_To_Latest_V1_15;
use v1_16::Latest_V1_15_To_V1_16_0;
use v1_18::Latest_V1_17_To_V1_18_0;
use crate::constants::{VERSION_MAJOR, VERSION_MINOR, VERSION_PATCH};
use crate::progress::{Progress, VariableNameStep};
use crate::update::upgrade::v1_16::Latest_V1_15_To_V1_16_0;
use crate::{Index, InternalError, Result};
trait UpgradeIndex {
@ -34,6 +37,7 @@ const UPGRADE_FUNCTIONS: &[&dyn UpgradeIndex] = &[
&Latest_V1_13_To_Latest_V1_14 {},
&Latest_V1_14_To_Latest_V1_15 {},
&Latest_V1_15_To_V1_16_0 {},
&Latest_V1_17_To_V1_18_0 {},
// This is the last upgrade function; it is called when the index is already up to date.
// Any other upgrade function must be added before this one.
&ToCurrentNoOp {},

View File

@ -0,0 +1,34 @@
use heed::RwTxn;
use super::UpgradeIndex;
use crate::progress::Progress;
use crate::vector::VectorStore;
use crate::{Index, Result};
/// Upgrade step whose target version is `1.18.0`.
///
/// For every embedder configured on the index, it asks the matching
/// [`VectorStore`] to convert its content from arroy.
// The non-camel-case name mirrors the naming of the sibling upgrade steps
// (e.g. `Latest_V1_15_To_V1_16_0`).
#[allow(non_camel_case_types)]
pub(super) struct Latest_V1_17_To_V1_18_0();

impl UpgradeIndex for Latest_V1_17_To_V1_18_0 {
    /// Runs [`VectorStore::convert_from_arroy`] once per embedder configuration.
    ///
    /// `_original` and `_progress` are currently unused. Any error raised while
    /// listing the configurations, resolving an embedder id, or converting a
    /// store is propagated. On success this always returns `Ok(false)`.
    fn upgrade(
        &self,
        wtxn: &mut RwTxn,
        index: &Index,
        _original: (u32, u32, u32),
        _progress: Progress,
    ) -> Result<bool> {
        let configs = index.embedding_configs();
        for embedder in configs.embedding_configs(wtxn)? {
            // TODO use the embedder name to display progress
            let is_quantized = embedder.config.quantized();
            // NOTE(review): this panics when no id is registered for the embedder
            // name; presumably that cannot happen for a listed config — confirm.
            let id = configs.embedder_id(wtxn, &embedder.name)?.unwrap();
            VectorStore::new(index.vector_store, id, is_quantized).convert_from_arroy(wtxn)?;
        }
        Ok(false)
    }

    /// The version the index is at once this step has run.
    fn target_version(&self) -> (u32, u32, u32) {
        (1, 18, 0)
    }
}

View File

@ -8,6 +8,7 @@ use hannoy::distances::{BinaryQuantizedCosine, Cosine};
use hannoy::ItemId;
use heed::{RoTxn, RwTxn, Unspecified};
use ordered_float::OrderedFloat;
use rand::SeedableRng;
use roaring::RoaringBitmap;
use serde::{Deserialize, Serialize};
use utoipa::ToSchema;
@ -69,7 +70,7 @@ impl VectorStore {
rtxn: &'a RoTxn<'a>,
db: hannoy::Database<D>,
) -> impl Iterator<Item = Result<hannoy::Reader<'a, D>, hannoy::Error>> + 'a {
hannoy_store_range_for_embedder(self.embedder_index).filter_map(move |index| {
vector_store_range_for_embedder(self.embedder_index).filter_map(move |index| {
match hannoy::Reader::open(rtxn, index, db) {
Ok(reader) => match reader.is_empty(rtxn) {
Ok(false) => Some(Ok(reader)),
@ -82,6 +83,24 @@ impl VectorStore {
})
}
fn arroy_readers<'a, D: arroy::Distance>(
&'a self,
rtxn: &'a RoTxn<'a>,
db: arroy::Database<D>,
) -> impl Iterator<Item = Result<arroy::Reader<'a, D>, arroy::Error>> + 'a {
vector_store_range_for_embedder(self.embedder_index).filter_map(move |index| {
match arroy::Reader::open(rtxn, index, db) {
Ok(reader) => match reader.is_empty(rtxn) {
Ok(false) => Some(Ok(reader)),
Ok(true) => None,
Err(e) => Some(Err(e)),
},
Err(arroy::Error::MissingMetadata(_)) => None,
Err(e) => Some(Err(e)),
}
})
}
/// The item ids that are present in the store specified by its id.
///
/// The ids are accessed via a lambda to avoid lifetime shenanigans.
@ -136,6 +155,44 @@ impl VectorStore {
}
}
/// Converts the arroy stores of this embedder so they can be used through hannoy.
///
/// The dimensions are taken from the first arroy reader yielded for this
/// embedder (quantized or angular database, depending on `self.quantized`).
/// When no reader is yielded, nothing is done and `Ok(())` is returned.
///
/// Otherwise, for every store index of the embedder, a hannoy writer is
/// created, the arroy conversion is prepared, and the store is built with a
/// freshly entropy-seeded `StdRng`.
///
/// # Errors
///
/// Propagates any error from reading the arroy stores, preparing the
/// conversion, or building the hannoy stores.
pub fn convert_from_arroy(&self, wtxn: &mut RwTxn) -> crate::Result<()> {
    // Look up the dimensions in the arroy database matching the store flavour.
    let dimensions = if self.quantized {
        self.arroy_readers(wtxn, self.arroy_quantized_db())
            .next()
            .transpose()?
            .map(|reader| reader.dimensions())
    } else {
        self.arroy_readers(wtxn, self.arroy_angular_db())
            .next()
            .transpose()?
            .map(|reader| reader.dimensions())
    };

    // No non-empty arroy store for this embedder: nothing to convert.
    let Some(dimensions) = dimensions else { return Ok(()) };

    for index in vector_store_range_for_embedder(self.embedder_index) {
        let mut rng = rand::rngs::StdRng::from_entropy();
        // The writer type depends on the distance, hence the two branches.
        if self.quantized {
            let writer = hannoy::Writer::new(self.quantized_db(), index, dimensions);
            writer.prepare_arroy_conversion(wtxn)?;
            writer.builder(&mut rng).build::<HANNOY_M, HANNOY_M0>(wtxn)?;
        } else {
            let writer = hannoy::Writer::new(self.angular_db(), index, dimensions);
            writer.prepare_arroy_conversion(wtxn)?;
            writer.builder(&mut rng).build::<HANNOY_M, HANNOY_M0>(wtxn)?;
        }
    }

    Ok(())
}
#[allow(clippy::too_many_arguments)]
pub fn build_and_quantize<R: rand::Rng + rand::SeedableRng>(
&mut self,
@ -147,7 +204,7 @@ impl VectorStore {
hannoy_memory: Option<usize>,
cancel: &(impl Fn() -> bool + Sync + Send),
) -> Result<(), hannoy::Error> {
for index in hannoy_store_range_for_embedder(self.embedder_index) {
for index in vector_store_range_for_embedder(self.embedder_index) {
if self.quantized {
let writer = hannoy::Writer::new(self.quantized_db(), index, dimension);
if writer.need_build(wtxn)? {
@ -204,7 +261,7 @@ impl VectorStore {
) -> Result<(), hannoy::Error> {
let dimension = embeddings.dimension();
for (index, vector) in
hannoy_store_range_for_embedder(self.embedder_index).zip(embeddings.iter())
vector_store_range_for_embedder(self.embedder_index).zip(embeddings.iter())
{
if self.quantized {
hannoy::Writer::new(self.quantized_db(), index, dimension)
@ -240,7 +297,7 @@ impl VectorStore {
) -> Result<(), hannoy::Error> {
let dimension = vector.len();
for index in hannoy_store_range_for_embedder(self.embedder_index) {
for index in vector_store_range_for_embedder(self.embedder_index) {
let writer = hannoy::Writer::new(db, index, dimension);
if !writer.contains_item(wtxn, item_id)? {
writer.add_item(wtxn, item_id, vector)?;
@ -289,7 +346,7 @@ impl VectorStore {
dimension: usize,
item_id: hannoy::ItemId,
) -> Result<(), hannoy::Error> {
for index in hannoy_store_range_for_embedder(self.embedder_index) {
for index in vector_store_range_for_embedder(self.embedder_index) {
if self.quantized {
let writer = hannoy::Writer::new(self.quantized_db(), index, dimension);
writer.del_item(wtxn, item_id)?;
@ -389,7 +446,7 @@ impl VectorStore {
) -> Result<bool, hannoy::Error> {
let dimension = vector.len();
for index in hannoy_store_range_for_embedder(self.embedder_index) {
for index in vector_store_range_for_embedder(self.embedder_index) {
let writer = hannoy::Writer::new(db, index, dimension);
if writer.contains_item(wtxn, item_id)? {
return writer.del_item(wtxn, item_id);
@ -399,7 +456,7 @@ impl VectorStore {
}
pub fn clear(&self, wtxn: &mut RwTxn, dimension: usize) -> Result<(), hannoy::Error> {
for index in hannoy_store_range_for_embedder(self.embedder_index) {
for index in vector_store_range_for_embedder(self.embedder_index) {
if self.quantized {
let writer = hannoy::Writer::new(self.quantized_db(), index, dimension);
if writer.is_empty(wtxn)? {
@ -423,7 +480,7 @@ impl VectorStore {
dimension: usize,
item: hannoy::ItemId,
) -> Result<bool, hannoy::Error> {
for index in hannoy_store_range_for_embedder(self.embedder_index) {
for index in vector_store_range_for_embedder(self.embedder_index) {
let contains = if self.quantized {
let writer = hannoy::Writer::new(self.quantized_db(), index, dimension);
if writer.is_empty(rtxn)? {
@ -557,6 +614,14 @@ impl VectorStore {
self.database.remap_data_type()
}
/// Returns the underlying database remapped to an arroy database using the
/// `Cosine` distance.
fn arroy_angular_db(&self) -> arroy::Database<arroy::distances::Cosine> {
self.database.remap_types()
}
/// Returns the underlying database remapped to an arroy database using the
/// `BinaryQuantizedCosine` distance.
fn arroy_quantized_db(&self) -> arroy::Database<arroy::distances::BinaryQuantizedCosine> {
self.database.remap_types()
}
pub fn aggregate_stats(
&self,
rtxn: &RoTxn,
@ -1238,7 +1303,7 @@ pub const fn is_cuda_enabled() -> bool {
cfg!(feature = "cuda")
}
fn hannoy_store_range_for_embedder(embedder_id: u8) -> impl Iterator<Item = u16> {
/// Yields the store index of every store belonging to `embedder_id`.
///
/// One index is produced for each `store_id` in `0..=u8::MAX`, computed by
/// `hannoy_store_for_embedder(embedder_id, store_id)`.
fn vector_store_range_for_embedder(embedder_id: u8) -> impl Iterator<Item = u16> {
(0..=u8::MAX).map(move |store_id| hannoy_store_for_embedder(embedder_id, store_id))
}