mirror of
https://github.com/meilisearch/meilisearch.git
synced 2025-09-06 04:36:32 +00:00
Integrate arroy with conversion capabilities
This commit is contained in:
4
Cargo.lock
generated
4
Cargo.lock
generated
@ -444,9 +444,9 @@ checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50"
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "arroy"
|
name = "arroy"
|
||||||
version = "0.6.1"
|
version = "0.6.3"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "08e6111f351d004bd13e95ab540721272136fd3218b39d3ec95a2ea1c4e6a0a6"
|
checksum = "8578a72223dfa13dfd9fc144d15260d134361789ebdea9b16e85a511edc73c7d"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"bytemuck",
|
"bytemuck",
|
||||||
"byteorder",
|
"byteorder",
|
||||||
|
@ -87,7 +87,7 @@ rhai = { version = "1.22.2", features = [
|
|||||||
"no_time",
|
"no_time",
|
||||||
"sync",
|
"sync",
|
||||||
] }
|
] }
|
||||||
arroy = "0.6.1"
|
arroy = "0.6.3"
|
||||||
hannoy = "0.0.4"
|
hannoy = "0.0.4"
|
||||||
rand = "0.8.5"
|
rand = "0.8.5"
|
||||||
tracing = "0.1.41"
|
tracing = "0.1.41"
|
||||||
@ -111,7 +111,11 @@ utoipa = { version = "5.4.0", features = [
|
|||||||
"openapi_extensions",
|
"openapi_extensions",
|
||||||
] }
|
] }
|
||||||
lru = "0.14.0"
|
lru = "0.14.0"
|
||||||
twox-hash = { version = "2.1.1", default-features = false, features = ["std", "xxhash3_64", "xxhash64"] }
|
twox-hash = { version = "2.1.1", default-features = false, features = [
|
||||||
|
"std",
|
||||||
|
"xxhash3_64",
|
||||||
|
"xxhash64",
|
||||||
|
] }
|
||||||
|
|
||||||
[dev-dependencies]
|
[dev-dependencies]
|
||||||
mimalloc = { version = "0.1.47", default-features = false }
|
mimalloc = { version = "0.1.47", default-features = false }
|
||||||
|
@ -5,6 +5,7 @@ use std::sync::atomic::{AtomicU32, AtomicUsize, Ordering};
|
|||||||
use std::sync::{Arc, RwLock};
|
use std::sync::{Arc, RwLock};
|
||||||
use std::time::{Duration, Instant};
|
use std::time::{Duration, Instant};
|
||||||
|
|
||||||
|
use enum_iterator::Sequence as _;
|
||||||
use indexmap::IndexMap;
|
use indexmap::IndexMap;
|
||||||
use itertools::Itertools;
|
use itertools::Itertools;
|
||||||
use serde::Serialize;
|
use serde::Serialize;
|
||||||
@ -95,6 +96,14 @@ impl Progress {
|
|||||||
|
|
||||||
durations.drain(..).map(|(name, duration)| (name, format!("{duration:.2?}"))).collect()
|
durations.drain(..).map(|(name, duration)| (name, format!("{duration:.2?}"))).collect()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// TODO: ideally we should expose the progress in a way that let arroy use it directly
|
||||||
|
pub(crate) fn update_progress_from_arroy(&self, progress: arroy::WriterProgress) {
|
||||||
|
self.update_progress(progress.main);
|
||||||
|
if let Some(sub) = progress.sub {
|
||||||
|
self.update_progress(sub);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Generate the names associated with the durations and push them.
|
/// Generate the names associated with the durations and push them.
|
||||||
@ -291,3 +300,45 @@ impl<T: steppe::Step> Step for Compat<T> {
|
|||||||
self.0.total().try_into().unwrap_or(u32::MAX)
|
self.0.total().try_into().unwrap_or(u32::MAX)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl Step for arroy::MainStep {
|
||||||
|
fn name(&self) -> Cow<'static, str> {
|
||||||
|
match self {
|
||||||
|
arroy::MainStep::PreProcessingTheItems => "pre processing the items",
|
||||||
|
arroy::MainStep::WritingTheDescendantsAndMetadata => {
|
||||||
|
"writing the descendants and metadata"
|
||||||
|
}
|
||||||
|
arroy::MainStep::RetrieveTheUpdatedItems => "retrieve the updated items",
|
||||||
|
arroy::MainStep::RetrievingTheTreeAndItemNodes => "retrieving the tree and item nodes",
|
||||||
|
arroy::MainStep::UpdatingTheTrees => "updating the trees",
|
||||||
|
arroy::MainStep::CreateNewTrees => "create new trees",
|
||||||
|
arroy::MainStep::WritingNodesToDatabase => "writing nodes to database",
|
||||||
|
arroy::MainStep::DeleteExtraneousTrees => "delete extraneous trees",
|
||||||
|
arroy::MainStep::WriteTheMetadata => "write the metadata",
|
||||||
|
arroy::MainStep::ConvertingHannoyToArroy => "converting hannoy to arroy",
|
||||||
|
}
|
||||||
|
.into()
|
||||||
|
}
|
||||||
|
|
||||||
|
fn current(&self) -> u32 {
|
||||||
|
*self as u32
|
||||||
|
}
|
||||||
|
|
||||||
|
fn total(&self) -> u32 {
|
||||||
|
Self::CARDINALITY as u32
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Step for arroy::SubStep {
|
||||||
|
fn name(&self) -> Cow<'static, str> {
|
||||||
|
self.unit.into()
|
||||||
|
}
|
||||||
|
|
||||||
|
fn current(&self) -> u32 {
|
||||||
|
self.current.load(Ordering::Relaxed)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn total(&self) -> u32 {
|
||||||
|
self.max
|
||||||
|
}
|
||||||
|
}
|
||||||
|
@ -23,9 +23,9 @@ impl UpgradeIndex for Latest_V1_18_New_Hannoy {
|
|||||||
/// REMOVE THIS FILE, IMPLEMENT CONVERSION AS A SETTING CHANGE
|
/// REMOVE THIS FILE, IMPLEMENT CONVERSION AS A SETTING CHANGE
|
||||||
let quantized = config.config.quantized();
|
let quantized = config.config.quantized();
|
||||||
let embedder_id = embedding_configs.embedder_id(wtxn, &config.name)?.unwrap();
|
let embedder_id = embedding_configs.embedder_id(wtxn, &config.name)?.unwrap();
|
||||||
let vector_store =
|
let mut vector_store =
|
||||||
VectorStore::new(backend, index.vector_store, embedder_id, quantized);
|
VectorStore::new(backend, index.vector_store, embedder_id, quantized);
|
||||||
vector_store.convert_from_arroy(wtxn, progress.clone())?;
|
vector_store.change_backend(wtxn, progress.clone())?;
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(false)
|
Ok(false)
|
||||||
|
@ -173,43 +173,93 @@ impl VectorStore {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn convert_from_arroy(&self, wtxn: &mut RwTxn, progress: Progress) -> crate::Result<()> {
|
pub fn change_backend(&mut self, wtxn: &mut RwTxn, progress: Progress) -> crate::Result<()> {
|
||||||
if self.quantized {
|
if self.backend == VectorStoreBackend::Arroy {
|
||||||
let dimensions = self
|
self.backend = VectorStoreBackend::Hannoy;
|
||||||
.arroy_readers(wtxn, self.arroy_quantized_db())
|
if self.quantized {
|
||||||
.next()
|
let dimensions = self
|
||||||
.transpose()?
|
.arroy_readers(wtxn, self.arroy_quantized_db())
|
||||||
.map(|reader| reader.dimensions());
|
.next()
|
||||||
|
.transpose()?
|
||||||
|
.map(|reader| reader.dimensions());
|
||||||
|
|
||||||
let Some(dimensions) = dimensions else { return Ok(()) };
|
let Some(dimensions) = dimensions else { return Ok(()) };
|
||||||
|
|
||||||
for index in vector_store_range_for_embedder(self.embedder_index) {
|
for index in vector_store_range_for_embedder(self.embedder_index) {
|
||||||
let mut rng = rand::rngs::StdRng::from_entropy();
|
let mut rng = rand::rngs::StdRng::from_entropy();
|
||||||
let writer = hannoy::Writer::new(self.quantized_db(), index, dimensions);
|
let writer = hannoy::Writer::new(self.quantized_db(), index, dimensions);
|
||||||
let mut builder = writer.builder(&mut rng).progress(progress.clone());
|
let mut builder = writer.builder(&mut rng).progress(progress.clone());
|
||||||
builder.prepare_arroy_conversion(wtxn)?;
|
builder.prepare_arroy_conversion(wtxn)?;
|
||||||
builder.build::<HANNOY_M, HANNOY_M0>(wtxn)?;
|
builder.build::<HANNOY_M, HANNOY_M0>(wtxn)?;
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
} else {
|
||||||
|
let dimensions = self
|
||||||
|
.arroy_readers(wtxn, self.arroy_angular_db())
|
||||||
|
.next()
|
||||||
|
.transpose()?
|
||||||
|
.map(|reader| reader.dimensions());
|
||||||
|
|
||||||
|
let Some(dimensions) = dimensions else { return Ok(()) };
|
||||||
|
|
||||||
|
for index in vector_store_range_for_embedder(self.embedder_index) {
|
||||||
|
let mut rng = rand::rngs::StdRng::from_entropy();
|
||||||
|
let writer = hannoy::Writer::new(self.angular_db(), index, dimensions);
|
||||||
|
let mut builder = writer.builder(&mut rng).progress(progress.clone());
|
||||||
|
builder.prepare_arroy_conversion(wtxn)?;
|
||||||
|
builder.build::<HANNOY_M, HANNOY_M0>(wtxn)?;
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(())
|
|
||||||
} else {
|
} else {
|
||||||
let dimensions = self
|
self.backend = VectorStoreBackend::Arroy;
|
||||||
.arroy_readers(wtxn, self.arroy_angular_db())
|
|
||||||
.next()
|
|
||||||
.transpose()?
|
|
||||||
.map(|reader| reader.dimensions());
|
|
||||||
|
|
||||||
let Some(dimensions) = dimensions else { return Ok(()) };
|
if self.quantized {
|
||||||
|
/// FIXME: make sure that all of the functions in the vector store can work both in arroy and hannoy
|
||||||
|
/// (the existing implementation was assuming that only reads could happen in arroy)
|
||||||
|
/// bonus points for making it harder to forget switching on arroy on hannoy when modifying the code
|
||||||
|
let dimensions = self
|
||||||
|
.readers(wtxn, self.quantized_db())
|
||||||
|
.next()
|
||||||
|
.transpose()?
|
||||||
|
.map(|reader| reader.dimensions());
|
||||||
|
|
||||||
for index in vector_store_range_for_embedder(self.embedder_index) {
|
let Some(dimensions) = dimensions else { return Ok(()) };
|
||||||
let mut rng = rand::rngs::StdRng::from_entropy();
|
|
||||||
let writer = hannoy::Writer::new(self.angular_db(), index, dimensions);
|
for index in vector_store_range_for_embedder(self.embedder_index) {
|
||||||
let mut builder = writer.builder(&mut rng).progress(progress.clone());
|
let mut rng = rand::rngs::StdRng::from_entropy();
|
||||||
builder.prepare_arroy_conversion(wtxn)?;
|
let writer = arroy::Writer::new(self.arroy_quantized_db(), index, dimensions);
|
||||||
builder.build::<HANNOY_M, HANNOY_M0>(wtxn)?;
|
let mut builder = writer.builder(&mut rng);
|
||||||
|
let builder =
|
||||||
|
builder.progress(|step| progress.update_progress_from_arroy(step));
|
||||||
|
builder.prepare_hannoy_conversion(wtxn)?;
|
||||||
|
builder.build(wtxn)?;
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
} else {
|
||||||
|
let dimensions = self
|
||||||
|
.readers(wtxn, self.angular_db())
|
||||||
|
.next()
|
||||||
|
.transpose()?
|
||||||
|
.map(|reader| reader.dimensions());
|
||||||
|
|
||||||
|
let Some(dimensions) = dimensions else { return Ok(()) };
|
||||||
|
|
||||||
|
for index in vector_store_range_for_embedder(self.embedder_index) {
|
||||||
|
let mut rng = rand::rngs::StdRng::from_entropy();
|
||||||
|
let writer = arroy::Writer::new(self.arroy_angular_db(), index, dimensions);
|
||||||
|
let mut builder = writer.builder(&mut rng);
|
||||||
|
let builder =
|
||||||
|
builder.progress(|step| progress.update_progress_from_arroy(step));
|
||||||
|
builder.prepare_hannoy_conversion(wtxn)?;
|
||||||
|
builder.build(wtxn)?;
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(())
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user