Adopt neutral terminology where arroy/hannoy would be confusing

This commit is contained in:
Louis Dureuil
2025-09-03 15:08:40 +02:00
parent 0faf495173
commit 13df964564
10 changed files with 75 additions and 78 deletions

View File

@ -320,7 +320,7 @@ async fn binary_quantize_clear_documents() {
}
"###);
// Make sure the hannoy DB has been cleared
// Make sure the vector DB has been cleared
let (documents, _code) =
index.search_post(json!({ "hybrid": { "embedder": "manual" }, "vector": [1, 1, 1] })).await;
snapshot!(documents, @r#"

View File

@ -10,11 +10,11 @@ use std::str::FromStr;
use meili_snap::{json_string, snapshot};
use meilisearch::option::MaxThreads;
pub use rest::create_mock;
use crate::common::index::Index;
use crate::common::{default_settings, GetAllDocumentsOptions, Server};
use crate::json;
pub use rest::create_mock;
pub async fn get_server_vector() -> Server {
Server::new().await
@ -684,7 +684,7 @@ async fn clear_documents() {
}
"###);
// Make sure the hannoy DB has been cleared
// Make sure the vector DB has been cleared
let (documents, _code) =
index.search_post(json!({ "vector": [1, 1, 1], "hybrid": {"embedder": "manual"} })).await;
snapshot!(documents, @r#"

View File

@ -236,7 +236,7 @@ async fn reset_embedder_documents() {
}
"###);
// Make sure the hannoy DB has been cleared
// Make sure the vector DB has been cleared
let (documents, _code) =
index.search_post(json!({ "vector": [1, 1, 1], "hybrid": {"embedder": "default"} })).await;
snapshot!(json_string!(documents), @r###"

View File

@ -142,7 +142,7 @@ enum Command {
#[derive(Clone, ValueEnum)]
enum IndexPart {
/// Will make the hannoy index hot.
/// Will make the vector index hot.
Hannoy,
}

View File

@ -178,7 +178,7 @@ pub struct Index {
/// Maps the document id, the facet field id and the strings.
pub field_id_docid_facet_strings: Database<FieldDocIdFacetStringCodec, Str>,
/// Maps an embedder name to its id in the hannoy store.
/// Maps an embedder name to its id in the vector store.
pub(crate) embedder_category_id: Database<Unspecified, Unspecified>,
/// Vector store based on hannoy™.
pub vector_store: hannoy::Database<Unspecified>,
@ -1881,7 +1881,7 @@ impl Index {
facet_id_is_empty_docids,
field_id_docid_facet_f64s,
field_id_docid_facet_strings,
vector_store: vector_hannoy,
vector_store,
embedder_category_id,
documents,
} = self;
@ -1952,7 +1952,7 @@ impl Index {
"field_id_docid_facet_strings",
field_id_docid_facet_strings.stat(rtxn).map(compute_size)?,
);
sizes.insert("vector_hannoy", vector_hannoy.stat(rtxn).map(compute_size)?);
sizes.insert("vector_store", vector_store.stat(rtxn).map(compute_size)?);
sizes.insert("embedder_category_id", embedder_category_id.stat(rtxn).map(compute_size)?);
sizes.insert("documents", documents.stat(rtxn).map(compute_size)?);

View File

@ -505,7 +505,7 @@ where
for (embedder_name, dimension) in dimension {
let wtxn = &mut *self.wtxn;
let vector_hannoy = self.index.vector_store;
let vector_store = self.index.vector_store;
let cancel = &self.should_abort;
let embedder_index =
@ -525,7 +525,7 @@ where
pool.install(|| {
let mut writer =
VectorStore::new(backend, vector_hannoy, embedder_index, was_quantized);
VectorStore::new(backend, vector_store, embedder_index, was_quantized);
writer.build_and_quantize(
wtxn,
// In the settings we don't have any progress to share

View File

@ -948,13 +948,13 @@ impl<'a, 'i> Transform<'a, 'i> {
else {
continue;
};
let hannoy = VectorStore::new(
let vector_store = VectorStore::new(
backend,
self.index.vector_store,
infos.embedder_id,
was_quantized,
);
let Some(dimensions) = hannoy.dimensions(wtxn)? else {
let Some(dimensions) = vector_store.dimensions(wtxn)? else {
continue;
};
for fragment_id in fragment_ids {
@ -962,17 +962,17 @@ impl<'a, 'i> Transform<'a, 'i> {
if infos.embedding_status.user_provided_docids().is_empty() {
// no user provided: clear store
hannoy.clear_store(wtxn, *fragment_id, dimensions)?;
vector_store.clear_store(wtxn, *fragment_id, dimensions)?;
continue;
}
// some user provided, remove only the ids that are not user provided
let to_delete = hannoy.items_in_store(wtxn, *fragment_id, |items| {
let to_delete = vector_store.items_in_store(wtxn, *fragment_id, |items| {
items - infos.embedding_status.user_provided_docids()
})?;
for to_delete in to_delete {
hannoy.del_item_in_store(wtxn, to_delete, *fragment_id, dimensions)?;
vector_store.del_item_in_store(wtxn, to_delete, *fragment_id, dimensions)?;
}
}
}

View File

@ -255,9 +255,9 @@ impl<'a> From<FrameGrantR<'a>> for FrameWithHeader<'a> {
#[repr(u8)]
pub enum EntryHeader {
DbOperation(DbOperation),
HannoyDeleteVector(HannoyDeleteVector),
HannoySetVectors(HannoySetVectors),
HannoySetVector(HannoySetVector),
DeleteVector(DeleteVector),
SetVectors(SetVectors),
SetVector(SetVector),
}
impl EntryHeader {
@ -268,9 +268,9 @@ impl EntryHeader {
const fn variant_id(&self) -> u8 {
match self {
EntryHeader::DbOperation(_) => 0,
EntryHeader::HannoyDeleteVector(_) => 1,
EntryHeader::HannoySetVectors(_) => 2,
EntryHeader::HannoySetVector(_) => 3,
EntryHeader::DeleteVector(_) => 1,
EntryHeader::SetVectors(_) => 2,
EntryHeader::SetVector(_) => 3,
}
}
@ -286,26 +286,26 @@ impl EntryHeader {
}
const fn total_delete_vector_size() -> usize {
Self::variant_size() + mem::size_of::<HannoyDeleteVector>()
Self::variant_size() + mem::size_of::<DeleteVector>()
}
/// The `dimensions` corresponds to the number of `f32` in the embedding.
fn total_set_vectors_size(count: usize, dimensions: usize) -> usize {
let embedding_size = dimensions * mem::size_of::<f32>();
Self::variant_size() + mem::size_of::<HannoySetVectors>() + embedding_size * count
Self::variant_size() + mem::size_of::<SetVectors>() + embedding_size * count
}
fn total_set_vector_size(dimensions: usize) -> usize {
let embedding_size = dimensions * mem::size_of::<f32>();
Self::variant_size() + mem::size_of::<HannoySetVector>() + embedding_size
Self::variant_size() + mem::size_of::<SetVector>() + embedding_size
}
fn header_size(&self) -> usize {
let payload_size = match self {
EntryHeader::DbOperation(op) => mem::size_of_val(op),
EntryHeader::HannoyDeleteVector(adv) => mem::size_of_val(adv),
EntryHeader::HannoySetVectors(asvs) => mem::size_of_val(asvs),
EntryHeader::HannoySetVector(asv) => mem::size_of_val(asv),
EntryHeader::DeleteVector(adv) => mem::size_of_val(adv),
EntryHeader::SetVectors(asvs) => mem::size_of_val(asvs),
EntryHeader::SetVector(asv) => mem::size_of_val(asv),
};
Self::variant_size() + payload_size
}
@ -319,19 +319,19 @@ impl EntryHeader {
EntryHeader::DbOperation(header)
}
1 => {
let header_bytes = &remaining[..mem::size_of::<HannoyDeleteVector>()];
let header_bytes = &remaining[..mem::size_of::<DeleteVector>()];
let header = checked::pod_read_unaligned(header_bytes);
EntryHeader::HannoyDeleteVector(header)
EntryHeader::DeleteVector(header)
}
2 => {
let header_bytes = &remaining[..mem::size_of::<HannoySetVectors>()];
let header_bytes = &remaining[..mem::size_of::<SetVectors>()];
let header = checked::pod_read_unaligned(header_bytes);
EntryHeader::HannoySetVectors(header)
EntryHeader::SetVectors(header)
}
3 => {
let header_bytes = &remaining[..mem::size_of::<HannoySetVector>()];
let header_bytes = &remaining[..mem::size_of::<SetVector>()];
let header = checked::pod_read_unaligned(header_bytes);
EntryHeader::HannoySetVector(header)
EntryHeader::SetVector(header)
}
id => panic!("invalid variant id: {id}"),
}
@ -341,9 +341,9 @@ impl EntryHeader {
let (first, remaining) = header_bytes.split_first_mut().unwrap();
let payload_bytes = match self {
EntryHeader::DbOperation(op) => bytemuck::bytes_of(op),
EntryHeader::HannoyDeleteVector(adv) => bytemuck::bytes_of(adv),
EntryHeader::HannoySetVectors(asvs) => bytemuck::bytes_of(asvs),
EntryHeader::HannoySetVector(asv) => bytemuck::bytes_of(asv),
EntryHeader::DeleteVector(adv) => bytemuck::bytes_of(adv),
EntryHeader::SetVectors(asvs) => bytemuck::bytes_of(asvs),
EntryHeader::SetVector(asv) => bytemuck::bytes_of(asv),
};
*first = self.variant_id();
remaining.copy_from_slice(payload_bytes);
@ -378,7 +378,7 @@ impl DbOperation {
#[derive(Debug, Clone, Copy, NoUninit, CheckedBitPattern)]
#[repr(transparent)]
pub struct HannoyDeleteVector {
pub struct DeleteVector {
pub docid: DocumentId,
}
@ -386,13 +386,13 @@ pub struct HannoyDeleteVector {
#[repr(C)]
/// The embeddings are in the remaining space and represent
/// non-aligned [f32] each with dimensions f32s.
pub struct HannoySetVectors {
pub struct SetVectors {
pub docid: DocumentId,
pub embedder_id: u8,
_padding: [u8; 3],
}
impl HannoySetVectors {
impl SetVectors {
fn embeddings_bytes<'a>(frame: &'a FrameGrantR<'_>) -> &'a [u8] {
let skip = EntryHeader::variant_size() + mem::size_of::<Self>();
&frame[skip..]
@ -416,14 +416,14 @@ impl HannoySetVectors {
#[repr(C)]
/// The embeddings are in the remaining space and represent
/// non-aligned [f32] each with dimensions f32s.
pub struct HannoySetVector {
pub struct SetVector {
pub docid: DocumentId,
pub embedder_id: u8,
pub extractor_id: u8,
_padding: [u8; 2],
}
impl HannoySetVector {
impl SetVector {
fn embeddings_bytes<'a>(frame: &'a FrameGrantR<'_>) -> &'a [u8] {
let skip = EntryHeader::variant_size() + mem::size_of::<Self>();
&frame[skip..]
@ -553,7 +553,7 @@ impl<'b> ExtractorBbqueueSender<'b> {
let refcell = self.producers.get().unwrap();
let mut producer = refcell.0.borrow_mut_or_yield();
let payload_header = EntryHeader::HannoyDeleteVector(HannoyDeleteVector { docid });
let payload_header = EntryHeader::DeleteVector(DeleteVector { docid });
let total_length = EntryHeader::total_delete_vector_size();
if total_length > max_grant {
panic!("The entry is larger ({total_length} bytes) than the BBQueue max grant ({max_grant} bytes)");
@ -589,8 +589,8 @@ impl<'b> ExtractorBbqueueSender<'b> {
// to zero to allocate no extra space at all
let dimensions = embeddings.first().map_or(0, |emb| emb.len());
let hannoy_set_vector = HannoySetVectors { docid, embedder_id, _padding: [0; 3] };
let payload_header = EntryHeader::HannoySetVectors(hannoy_set_vector);
let set_vectors = SetVectors { docid, embedder_id, _padding: [0; 3] };
let payload_header = EntryHeader::SetVectors(set_vectors);
let total_length = EntryHeader::total_set_vectors_size(embeddings.len(), dimensions);
if total_length > max_grant {
let mut value_file = tempfile::tempfile().map(BufWriter::new)?;
@ -650,9 +650,8 @@ impl<'b> ExtractorBbqueueSender<'b> {
// to zero to allocate no extra space at all
let dimensions = embedding.as_ref().map_or(0, |emb| emb.len());
let hannoy_set_vector =
HannoySetVector { docid, embedder_id, extractor_id, _padding: [0; 2] };
let payload_header = EntryHeader::HannoySetVector(hannoy_set_vector);
let set_vector = SetVector { docid, embedder_id, extractor_id, _padding: [0; 2] };
let payload_header = EntryHeader::SetVector(set_vector);
let total_length = EntryHeader::total_set_vector_size(dimensions);
if total_length > max_grant {
let mut value_file = tempfile::tempfile().map(BufWriter::new)?;

View File

@ -67,7 +67,7 @@ where
let mut bbbuffers = Vec::new();
let finished_extraction = AtomicBool::new(false);
let hannoy_memory = grenad_parameters.max_memory;
let vector_memory = grenad_parameters.max_memory;
let (grenad_parameters, total_bbbuffer_capacity) =
indexer_memory_settings(pool.current_num_threads(), grenad_parameters);
@ -132,7 +132,7 @@ where
let vector_arroy = index.vector_store;
let backend = index.get_vector_store(wtxn)?;
let hannoy_writers: Result<HashMap<_, _>> = embedders
let vector_stores: Result<HashMap<_, _>> = embedders
.inner_as_ref()
.iter()
.map(|(embedder_name, runtime)| {
@ -155,10 +155,10 @@ where
})
.collect();
let mut hannoy_writers = hannoy_writers?;
let mut vector_stores = vector_stores?;
let congestion =
write_to_db(writer_receiver, finished_extraction, index, wtxn, &hannoy_writers)?;
write_to_db(writer_receiver, finished_extraction, index, wtxn, &vector_stores)?;
indexing_context.progress.update_progress(IndexingStep::WaitingForExtractors);
@ -172,8 +172,8 @@ where
wtxn,
indexing_context.progress,
index_embeddings,
hannoy_memory,
&mut hannoy_writers,
vector_memory,
&mut vector_stores,
None,
&indexing_context.must_stop_processing,
)
@ -229,7 +229,7 @@ where
let mut bbbuffers = Vec::new();
let finished_extraction = AtomicBool::new(false);
let hannoy_memory = grenad_parameters.max_memory;
let vector_memory = grenad_parameters.max_memory;
let (grenad_parameters, total_bbbuffer_capacity) =
indexer_memory_settings(pool.current_num_threads(), grenad_parameters);
@ -286,7 +286,7 @@ where
let new_embedders = settings_delta.new_embedders();
let embedder_actions = settings_delta.embedder_actions();
let index_embedder_category_ids = settings_delta.new_embedder_category_id();
let mut hannoy_writers = hannoy_writers_from_embedder_actions(
let mut vector_stores = vector_stores_from_embedder_actions(
index,
wtxn,
embedder_actions,
@ -295,7 +295,7 @@ where
)?;
let congestion =
write_to_db(writer_receiver, finished_extraction, index, wtxn, &hannoy_writers)?;
write_to_db(writer_receiver, finished_extraction, index, wtxn, &vector_stores)?;
indexing_context.progress.update_progress(IndexingStep::WaitingForExtractors);
@ -309,8 +309,8 @@ where
wtxn,
indexing_context.progress,
index_embeddings,
hannoy_memory,
&mut hannoy_writers,
vector_memory,
&mut vector_stores,
Some(embedder_actions),
&indexing_context.must_stop_processing,
)
@ -340,7 +340,7 @@ where
Ok(congestion)
}
fn hannoy_writers_from_embedder_actions<'indexer>(
fn vector_stores_from_embedder_actions<'indexer>(
index: &Index,
rtxn: &RoTxn,
embedder_actions: &'indexer BTreeMap<String, EmbedderAction>,

View File

@ -23,7 +23,7 @@ pub fn write_to_db(
finished_extraction: &AtomicBool,
index: &Index,
wtxn: &mut RwTxn<'_>,
hannoy_writers: &HashMap<u8, (&str, &Embedder, VectorStore, usize)>,
vector_stores: &HashMap<u8, (&str, &Embedder, VectorStore, usize)>,
) -> Result<ChannelCongestion> {
// Used by the SetVector to copy the embedding into an
// aligned memory area, required by the vector store to accept a new vector.
@ -56,7 +56,7 @@ pub fn write_to_db(
ReceiverAction::LargeVectors(large_vectors) => {
let LargeVectors { docid, embedder_id, .. } = large_vectors;
let (_, _, writer, dimensions) =
hannoy_writers.get(&embedder_id).expect("requested a missing embedder");
vector_stores.get(&embedder_id).expect("requested a missing embedder");
let mut embeddings = Embeddings::new(*dimensions);
for embedding in large_vectors.read_embeddings(*dimensions) {
embeddings.push(embedding.to_vec()).unwrap();
@ -68,7 +68,7 @@ pub fn write_to_db(
large_vector @ LargeVector { docid, embedder_id, extractor_id, .. },
) => {
let (_, _, writer, dimensions) =
hannoy_writers.get(&embedder_id).expect("requested a missing embedder");
vector_stores.get(&embedder_id).expect("requested a missing embedder");
let embedding = large_vector.read_embedding(*dimensions);
writer.add_item_in_store(wtxn, docid, extractor_id, embedding)?;
}
@ -80,12 +80,12 @@ pub fn write_to_db(
&mut writer_receiver,
index,
wtxn,
hannoy_writers,
vector_stores,
&mut aligned_embedding,
)?;
}
write_from_bbqueue(&mut writer_receiver, index, wtxn, hannoy_writers, &mut aligned_embedding)?;
write_from_bbqueue(&mut writer_receiver, index, wtxn, vector_stores, &mut aligned_embedding)?;
Ok(ChannelCongestion {
attempts: writer_receiver.sent_messages_attempts(),
@ -115,8 +115,8 @@ pub fn build_vectors<MSP>(
wtxn: &mut RwTxn<'_>,
progress: &Progress,
index_embeddings: Vec<IndexEmbeddingConfig>,
hannoy_memory: Option<usize>,
hannoy_writers: &mut HashMap<u8, (&str, &Embedder, VectorStore, usize)>,
vector_memory: Option<usize>,
vector_stores: &mut HashMap<u8, (&str, &Embedder, VectorStore, usize)>,
embeder_actions: Option<&BTreeMap<String, EmbedderAction>>,
must_stop_processing: &MSP,
) -> Result<()>
@ -129,7 +129,7 @@ where
let seed = rand::random();
let mut rng = rand::rngs::StdRng::seed_from_u64(seed);
for (_index, (embedder_name, _embedder, writer, dimensions)) in hannoy_writers {
for (_index, (embedder_name, _embedder, writer, dimensions)) in vector_stores {
let dimensions = *dimensions;
let is_being_quantized = embeder_actions
.and_then(|actions| actions.get(*embedder_name).map(|action| action.is_being_quantized))
@ -140,7 +140,7 @@ where
&mut rng,
dimensions,
is_being_quantized,
hannoy_memory,
vector_memory,
must_stop_processing,
)?;
}
@ -181,7 +181,7 @@ pub fn write_from_bbqueue(
writer_receiver: &mut WriterBbqueueReceiver<'_>,
index: &Index,
wtxn: &mut RwTxn<'_>,
hannoy_writers: &HashMap<u8, (&str, &crate::vector::Embedder, VectorStore, usize)>,
vector_stores: &HashMap<u8, (&str, &crate::vector::Embedder, VectorStore, usize)>,
aligned_embedding: &mut Vec<f32>,
) -> crate::Result<()> {
while let Some(frame_with_header) = writer_receiver.recv_frame() {
@ -221,17 +221,17 @@ pub fn write_from_bbqueue(
},
}
}
EntryHeader::HannoyDeleteVector(HannoyDeleteVector { docid }) => {
for (_index, (_name, _embedder, writer, dimensions)) in hannoy_writers {
EntryHeader::DeleteVector(DeleteVector { docid }) => {
for (_index, (_name, _embedder, writer, dimensions)) in vector_stores {
let dimensions = *dimensions;
writer.del_items(wtxn, dimensions, docid)?;
}
}
EntryHeader::HannoySetVectors(asvs) => {
let HannoySetVectors { docid, embedder_id, .. } = asvs;
EntryHeader::SetVectors(asvs) => {
let SetVectors { docid, embedder_id, .. } = asvs;
let frame = frame_with_header.frame();
let (_, _, writer, dimensions) =
hannoy_writers.get(&embedder_id).expect("requested a missing embedder");
vector_stores.get(&embedder_id).expect("requested a missing embedder");
let mut embeddings = Embeddings::new(*dimensions);
let all_embeddings = asvs.read_all_embeddings_into_vec(frame, aligned_embedding);
writer.del_items(wtxn, *dimensions, docid)?;
@ -245,12 +245,10 @@ pub fn write_from_bbqueue(
writer.add_items(wtxn, docid, &embeddings)?;
}
}
EntryHeader::HannoySetVector(
asv @ HannoySetVector { docid, embedder_id, extractor_id, .. },
) => {
EntryHeader::SetVector(asv @ SetVector { docid, embedder_id, extractor_id, .. }) => {
let frame = frame_with_header.frame();
let (_, _, writer, dimensions) =
hannoy_writers.get(&embedder_id).expect("requested a missing embedder");
vector_stores.get(&embedder_id).expect("requested a missing embedder");
let embedding = asv.read_all_embeddings_into_vec(frame, aligned_embedding);
if embedding.is_empty() {