mirror of
https://github.com/meilisearch/meilisearch.git
synced 2025-09-06 04:36:32 +00:00
First version of Hannoy dumpless upgrade
This commit is contained in:
@ -8,6 +8,7 @@ use hannoy::distances::{Cosine, Hamming};
|
||||
use hannoy::ItemId;
|
||||
use heed::{RoTxn, RwTxn, Unspecified};
|
||||
use ordered_float::OrderedFloat;
|
||||
use rand::SeedableRng as _;
|
||||
use roaring::RoaringBitmap;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use utoipa::ToSchema;
|
||||
@ -64,12 +65,30 @@ impl VectorStore {
|
||||
self.embedder_index
|
||||
}
|
||||
|
||||
fn arroy_readers<'a, D: arroy::Distance>(
|
||||
&'a self,
|
||||
rtxn: &'a RoTxn<'a>,
|
||||
db: arroy::Database<D>,
|
||||
) -> impl Iterator<Item = Result<arroy::Reader<'a, D>, arroy::Error>> + 'a {
|
||||
vector_store_range_for_embedder(self.embedder_index).filter_map(move |index| {
|
||||
match arroy::Reader::open(rtxn, index, db) {
|
||||
Ok(reader) => match reader.is_empty(rtxn) {
|
||||
Ok(false) => Some(Ok(reader)),
|
||||
Ok(true) => None,
|
||||
Err(e) => Some(Err(e)),
|
||||
},
|
||||
Err(arroy::Error::MissingMetadata(_)) => None,
|
||||
Err(e) => Some(Err(e)),
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
fn readers<'a, D: hannoy::Distance>(
|
||||
&'a self,
|
||||
rtxn: &'a RoTxn<'a>,
|
||||
db: hannoy::Database<D>,
|
||||
) -> impl Iterator<Item = Result<hannoy::Reader<'a, D>, hannoy::Error>> + 'a {
|
||||
hannoy_store_range_for_embedder(self.embedder_index).filter_map(move |index| {
|
||||
vector_store_range_for_embedder(self.embedder_index).filter_map(move |index| {
|
||||
match hannoy::Reader::open(rtxn, index, db) {
|
||||
Ok(reader) => match reader.is_empty(rtxn) {
|
||||
Ok(false) => Some(Ok(reader)),
|
||||
@ -136,6 +155,46 @@ impl VectorStore {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn convert_from_arroy(&self, wtxn: &mut RwTxn) -> crate::Result<()> {
|
||||
if self.quantized {
|
||||
let dimensions = self
|
||||
.arroy_readers(wtxn, self.arroy_quantized_db())
|
||||
.next()
|
||||
.transpose()?
|
||||
.map(|reader| reader.dimensions());
|
||||
|
||||
let Some(dimensions) = dimensions else { return Ok(()) };
|
||||
|
||||
for index in vector_store_range_for_embedder(self.embedder_index) {
|
||||
let mut rng = rand::rngs::StdRng::from_entropy();
|
||||
let writer = hannoy::Writer::new(self.quantized_db(), index, dimensions);
|
||||
let mut builder = writer.builder(&mut rng);
|
||||
builder.prepare_arroy_conversion(wtxn)?;
|
||||
builder.build::<HANNOY_M, HANNOY_M0>(wtxn)?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
} else {
|
||||
let dimensions = self
|
||||
.arroy_readers(wtxn, self.arroy_angular_db())
|
||||
.next()
|
||||
.transpose()?
|
||||
.map(|reader| reader.dimensions());
|
||||
|
||||
let Some(dimensions) = dimensions else { return Ok(()) };
|
||||
|
||||
for index in vector_store_range_for_embedder(self.embedder_index) {
|
||||
let mut rng = rand::rngs::StdRng::from_entropy();
|
||||
let writer = hannoy::Writer::new(self.angular_db(), index, dimensions);
|
||||
let mut builder = writer.builder(&mut rng);
|
||||
builder.prepare_arroy_conversion(wtxn)?;
|
||||
builder.build::<HANNOY_M, HANNOY_M0>(wtxn)?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub fn build_and_quantize<R: rand::Rng + rand::SeedableRng>(
|
||||
&mut self,
|
||||
@ -147,7 +206,7 @@ impl VectorStore {
|
||||
hannoy_memory: Option<usize>,
|
||||
cancel: &(impl Fn() -> bool + Sync + Send),
|
||||
) -> Result<(), hannoy::Error> {
|
||||
for index in hannoy_store_range_for_embedder(self.embedder_index) {
|
||||
for index in vector_store_range_for_embedder(self.embedder_index) {
|
||||
if self.quantized {
|
||||
let writer = hannoy::Writer::new(self.quantized_db(), index, dimension);
|
||||
if writer.need_build(wtxn)? {
|
||||
@ -202,7 +261,7 @@ impl VectorStore {
|
||||
) -> Result<(), hannoy::Error> {
|
||||
let dimension = embeddings.dimension();
|
||||
for (index, vector) in
|
||||
hannoy_store_range_for_embedder(self.embedder_index).zip(embeddings.iter())
|
||||
vector_store_range_for_embedder(self.embedder_index).zip(embeddings.iter())
|
||||
{
|
||||
if self.quantized {
|
||||
hannoy::Writer::new(self.quantized_db(), index, dimension)
|
||||
@ -238,7 +297,7 @@ impl VectorStore {
|
||||
) -> Result<(), hannoy::Error> {
|
||||
let dimension = vector.len();
|
||||
|
||||
for index in hannoy_store_range_for_embedder(self.embedder_index) {
|
||||
for index in vector_store_range_for_embedder(self.embedder_index) {
|
||||
let writer = hannoy::Writer::new(db, index, dimension);
|
||||
if !writer.contains_item(wtxn, item_id)? {
|
||||
writer.add_item(wtxn, item_id, vector)?;
|
||||
@ -287,7 +346,7 @@ impl VectorStore {
|
||||
dimension: usize,
|
||||
item_id: hannoy::ItemId,
|
||||
) -> Result<(), hannoy::Error> {
|
||||
for index in hannoy_store_range_for_embedder(self.embedder_index) {
|
||||
for index in vector_store_range_for_embedder(self.embedder_index) {
|
||||
if self.quantized {
|
||||
let writer = hannoy::Writer::new(self.quantized_db(), index, dimension);
|
||||
writer.del_item(wtxn, item_id)?;
|
||||
@ -387,7 +446,7 @@ impl VectorStore {
|
||||
) -> Result<bool, hannoy::Error> {
|
||||
let dimension = vector.len();
|
||||
|
||||
for index in hannoy_store_range_for_embedder(self.embedder_index) {
|
||||
for index in vector_store_range_for_embedder(self.embedder_index) {
|
||||
let writer = hannoy::Writer::new(db, index, dimension);
|
||||
if writer.contains_item(wtxn, item_id)? {
|
||||
return writer.del_item(wtxn, item_id);
|
||||
@ -397,7 +456,7 @@ impl VectorStore {
|
||||
}
|
||||
|
||||
pub fn clear(&self, wtxn: &mut RwTxn, dimension: usize) -> Result<(), hannoy::Error> {
|
||||
for index in hannoy_store_range_for_embedder(self.embedder_index) {
|
||||
for index in vector_store_range_for_embedder(self.embedder_index) {
|
||||
if self.quantized {
|
||||
let writer = hannoy::Writer::new(self.quantized_db(), index, dimension);
|
||||
if writer.is_empty(wtxn)? {
|
||||
@ -421,7 +480,7 @@ impl VectorStore {
|
||||
dimension: usize,
|
||||
item: hannoy::ItemId,
|
||||
) -> Result<bool, hannoy::Error> {
|
||||
for index in hannoy_store_range_for_embedder(self.embedder_index) {
|
||||
for index in vector_store_range_for_embedder(self.embedder_index) {
|
||||
let contains = if self.quantized {
|
||||
let writer = hannoy::Writer::new(self.quantized_db(), index, dimension);
|
||||
if writer.is_empty(rtxn)? {
|
||||
@ -547,6 +606,14 @@ impl VectorStore {
|
||||
Ok(vectors)
|
||||
}
|
||||
|
||||
fn arroy_angular_db(&self) -> arroy::Database<arroy::distances::Cosine> {
|
||||
self.database.remap_types()
|
||||
}
|
||||
|
||||
fn arroy_quantized_db(&self) -> arroy::Database<arroy::distances::BinaryQuantizedCosine> {
|
||||
self.database.remap_types()
|
||||
}
|
||||
|
||||
fn angular_db(&self) -> hannoy::Database<Cosine> {
|
||||
self.database.remap_data_type()
|
||||
}
|
||||
@ -1230,7 +1297,7 @@ pub const fn is_cuda_enabled() -> bool {
|
||||
cfg!(feature = "cuda")
|
||||
}
|
||||
|
||||
fn hannoy_store_range_for_embedder(embedder_id: u8) -> impl Iterator<Item = u16> {
|
||||
fn vector_store_range_for_embedder(embedder_id: u8) -> impl Iterator<Item = u16> {
|
||||
(0..=u8::MAX).map(move |store_id| hannoy_store_for_embedder(embedder_id, store_id))
|
||||
}
|
||||
|
||||
|
Reference in New Issue
Block a user