remove-me: Debugging the missing key hannoy bug

This commit is contained in:
Kerollmops
2025-08-13 18:06:39 +02:00
parent df578a5118
commit 126bd99205
4 changed files with 39 additions and 21 deletions

2
Cargo.lock generated
View File

@ -2603,8 +2603,6 @@ dependencies = [
[[package]] [[package]]
name = "hannoy" name = "hannoy"
version = "0.0.3" version = "0.0.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0cac6ebc04fc7246356d29908b55315c26c695a2ea2f692de9f72c0ac61ca1b1"
dependencies = [ dependencies = [
"bytemuck", "bytemuck",
"byteorder", "byteorder",

View File

@ -88,7 +88,7 @@ rhai = { version = "1.22.2", features = [
"sync", "sync",
] } ] }
arroy = "0.6.1" arroy = "0.6.1"
hannoy = "0.0.3" hannoy = { path = "../../../hannoy" }
rand = "0.8.5" rand = "0.8.5"
tracing = "0.1.41" tracing = "0.1.41"
ureq = { version = "2.12.1", features = ["json"] } ureq = { version = "2.12.1", features = ["json"] }

View File

@ -393,20 +393,17 @@ fn delete_old_embedders_and_fragments<SD>(
where where
SD: SettingsDelta, SD: SettingsDelta,
{ {
let index_version = index.get_version(wtxn)?.unwrap();
for action in settings_delta.embedder_actions().values() { for action in settings_delta.embedder_actions().values() {
let Some(WriteBackToDocuments { embedder_id, .. }) = action.write_back() else { let Some(WriteBackToDocuments { embedder_id, .. }) = action.write_back() else {
continue; continue;
}; };
let reader = VectorStore::new( let vector_store =
index.get_version(wtxn)?.unwrap(), VectorStore::new(index_version, index.vector_store, *embedder_id, action.was_quantized);
index.vector_store, let Some(dimensions) = vector_store.dimensions(wtxn)? else {
*embedder_id,
action.was_quantized,
);
let Some(dimensions) = reader.dimensions(wtxn)? else {
continue; continue;
}; };
reader.clear(wtxn, dimensions)?; vector_store.clear(wtxn, dimensions)?;
} }
// remove all vectors for the specified fragments // remove all vectors for the specified fragments
@ -418,13 +415,9 @@ where
let Some(infos) = index.embedding_configs().embedder_info(wtxn, embedder_name)? else { let Some(infos) = index.embedding_configs().embedder_info(wtxn, embedder_name)? else {
continue; continue;
}; };
let arroy = VectorStore::new( let vector_store =
index.get_version(wtxn)?.unwrap(), VectorStore::new(index_version, index.vector_store, infos.embedder_id, was_quantized);
index.vector_store, let Some(dimensions) = vector_store.dimensions(wtxn)? else {
infos.embedder_id,
was_quantized,
);
let Some(dimensions) = arroy.dimensions(wtxn)? else {
continue; continue;
}; };
for fragment_id in fragment_ids { for fragment_id in fragment_ids {
@ -432,17 +425,17 @@ where
if infos.embedding_status.user_provided_docids().is_empty() { if infos.embedding_status.user_provided_docids().is_empty() {
// no user provided: clear store // no user provided: clear store
arroy.clear_store(wtxn, *fragment_id, dimensions)?; vector_store.clear_store(wtxn, *fragment_id, dimensions)?;
continue; continue;
} }
// some user provided, remove only the ids that are not user provided // some user provided, remove only the ids that are not user provided
let to_delete = arroy.items_in_store(wtxn, *fragment_id, |items| { let to_delete = vector_store.items_in_store(wtxn, *fragment_id, |items| {
items - infos.embedding_status.user_provided_docids() items - infos.embedding_status.user_provided_docids()
})?; })?;
for to_delete in to_delete { for to_delete in to_delete {
arroy.del_item_in_store(wtxn, to_delete, *fragment_id, dimensions)?; vector_store.del_item_in_store(wtxn, to_delete, *fragment_id, dimensions)?;
} }
} }
} }

View File

@ -256,6 +256,7 @@ impl VectorStore {
hannoy_memory: Option<usize>, hannoy_memory: Option<usize>,
cancel: &(impl Fn() -> bool + Sync + Send), cancel: &(impl Fn() -> bool + Sync + Send),
) -> Result<(), hannoy::Error> { ) -> Result<(), hannoy::Error> {
eprintln!("Build and quantize embedder_index={}", self.embedder_index);
for index in vector_store_range_for_embedder(self.embedder_index) { for index in vector_store_range_for_embedder(self.embedder_index) {
if self.quantized { if self.quantized {
let writer = hannoy::Writer::new(self.quantized_db(), index, dimension); let writer = hannoy::Writer::new(self.quantized_db(), index, dimension);
@ -309,6 +310,10 @@ impl VectorStore {
item_id: hannoy::ItemId, item_id: hannoy::ItemId,
embeddings: &Embeddings<f32>, embeddings: &Embeddings<f32>,
) -> Result<(), hannoy::Error> { ) -> Result<(), hannoy::Error> {
eprintln!(
"Adding item_id={item_id} to all stores in embedder_index={}",
self.embedder_index
);
let dimension = embeddings.dimension(); let dimension = embeddings.dimension();
for (index, vector) in for (index, vector) in
vector_store_range_for_embedder(self.embedder_index).zip(embeddings.iter()) vector_store_range_for_embedder(self.embedder_index).zip(embeddings.iter())
@ -331,6 +336,7 @@ impl VectorStore {
item_id: hannoy::ItemId, item_id: hannoy::ItemId,
vector: &[f32], vector: &[f32],
) -> Result<(), hannoy::Error> { ) -> Result<(), hannoy::Error> {
eprintln!("Adding item_id={item_id} and embedder_index={}", self.embedder_index);
if self.quantized { if self.quantized {
self._add_item(wtxn, self.quantized_db(), item_id, vector) self._add_item(wtxn, self.quantized_db(), item_id, vector)
} else { } else {
@ -367,6 +373,10 @@ impl VectorStore {
store_id: u8, store_id: u8,
vector: &[f32], vector: &[f32],
) -> Result<(), hannoy::Error> { ) -> Result<(), hannoy::Error> {
eprintln!(
"Adding item_id={item_id} in store_id={store_id} and embedder_index={}",
self.embedder_index
);
if self.quantized { if self.quantized {
self._add_item_in_store(wtxn, self.quantized_db(), item_id, store_id, vector) self._add_item_in_store(wtxn, self.quantized_db(), item_id, store_id, vector)
} else { } else {
@ -396,6 +406,10 @@ impl VectorStore {
dimension: usize, dimension: usize,
item_id: hannoy::ItemId, item_id: hannoy::ItemId,
) -> Result<(), hannoy::Error> { ) -> Result<(), hannoy::Error> {
eprintln!(
"Deleting item_id={item_id} in all stores in embedder_index={}",
self.embedder_index
);
for index in vector_store_range_for_embedder(self.embedder_index) { for index in vector_store_range_for_embedder(self.embedder_index) {
if self.quantized { if self.quantized {
let writer = hannoy::Writer::new(self.quantized_db(), index, dimension); let writer = hannoy::Writer::new(self.quantized_db(), index, dimension);
@ -423,6 +437,10 @@ impl VectorStore {
store_id: u8, store_id: u8,
dimensions: usize, dimensions: usize,
) -> Result<bool, hannoy::Error> { ) -> Result<bool, hannoy::Error> {
eprintln!(
"Deleting item_id={item_id} in store_id={store_id} and embedder_index={}",
self.embedder_index
);
if self.quantized { if self.quantized {
self._del_item_in_store(wtxn, self.quantized_db(), item_id, store_id, dimensions) self._del_item_in_store(wtxn, self.quantized_db(), item_id, store_id, dimensions)
} else { } else {
@ -454,6 +472,10 @@ impl VectorStore {
store_id: u8, store_id: u8,
dimensions: usize, dimensions: usize,
) -> Result<(), hannoy::Error> { ) -> Result<(), hannoy::Error> {
eprintln!(
"Clearing items in store_id={store_id} and embedder_index={}",
self.embedder_index
);
if self.quantized { if self.quantized {
self._clear_store(wtxn, self.quantized_db(), store_id, dimensions) self._clear_store(wtxn, self.quantized_db(), store_id, dimensions)
} else { } else {
@ -480,6 +502,10 @@ impl VectorStore {
item_id: hannoy::ItemId, item_id: hannoy::ItemId,
vector: &[f32], vector: &[f32],
) -> Result<bool, hannoy::Error> { ) -> Result<bool, hannoy::Error> {
eprintln!(
"Deleting item_id={item_id} from all stores in embedder_index={}",
self.embedder_index
);
if self.quantized { if self.quantized {
self._del_item(wtxn, self.quantized_db(), item_id, vector) self._del_item(wtxn, self.quantized_db(), item_id, vector)
} else { } else {
@ -506,6 +532,7 @@ impl VectorStore {
} }
pub fn clear(&self, wtxn: &mut RwTxn, dimension: usize) -> Result<(), hannoy::Error> { pub fn clear(&self, wtxn: &mut RwTxn, dimension: usize) -> Result<(), hannoy::Error> {
eprintln!("Clearing all items from embedder_index={}", self.embedder_index);
for index in vector_store_range_for_embedder(self.embedder_index) { for index in vector_store_range_for_embedder(self.embedder_index) {
if self.quantized { if self.quantized {
let writer = hannoy::Writer::new(self.quantized_db(), index, dimension); let writer = hannoy::Writer::new(self.quantized_db(), index, dimension);