mirror of
https://github.com/meilisearch/meilisearch.git
synced 2025-09-06 04:36:32 +00:00
First version with hannoy
This commit is contained in:
85
Cargo.lock
generated
85
Cargo.lock
generated
@ -442,28 +442,6 @@ version = "0.7.6"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50"
|
checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50"
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "arroy"
|
|
||||||
version = "0.6.1"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "08e6111f351d004bd13e95ab540721272136fd3218b39d3ec95a2ea1c4e6a0a6"
|
|
||||||
dependencies = [
|
|
||||||
"bytemuck",
|
|
||||||
"byteorder",
|
|
||||||
"enum-iterator",
|
|
||||||
"heed",
|
|
||||||
"memmap2",
|
|
||||||
"nohash",
|
|
||||||
"ordered-float 4.6.0",
|
|
||||||
"page_size",
|
|
||||||
"rand 0.8.5",
|
|
||||||
"rayon",
|
|
||||||
"roaring",
|
|
||||||
"tempfile",
|
|
||||||
"thiserror 2.0.12",
|
|
||||||
"tracing",
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "assert-json-diff"
|
name = "assert-json-diff"
|
||||||
version = "2.0.2"
|
version = "2.0.2"
|
||||||
@ -2600,6 +2578,32 @@ dependencies = [
|
|||||||
"rand_distr",
|
"rand_distr",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "hannoy"
|
||||||
|
version = "0.7.0"
|
||||||
|
source = "git+https://github.com/nnethercott/hannoy?branch=incremental-indexing#364f611b10642ad34a3466b80dc687d1114bda4f"
|
||||||
|
dependencies = [
|
||||||
|
"bytemuck",
|
||||||
|
"byteorder",
|
||||||
|
"enum-iterator",
|
||||||
|
"hashbrown 0.15.4",
|
||||||
|
"heed",
|
||||||
|
"memmap2",
|
||||||
|
"min-max-heap",
|
||||||
|
"nohash",
|
||||||
|
"ordered-float 5.0.0",
|
||||||
|
"page_size",
|
||||||
|
"papaya",
|
||||||
|
"rand 0.8.5",
|
||||||
|
"rayon",
|
||||||
|
"roaring",
|
||||||
|
"slice-group-by",
|
||||||
|
"tempfile",
|
||||||
|
"thiserror 2.0.12",
|
||||||
|
"tinyvec",
|
||||||
|
"tracing",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "hash32"
|
name = "hash32"
|
||||||
version = "0.3.1"
|
version = "0.3.1"
|
||||||
@ -3921,7 +3925,6 @@ name = "milli"
|
|||||||
version = "1.16.0"
|
version = "1.16.0"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"allocator-api2 0.3.0",
|
"allocator-api2 0.3.0",
|
||||||
"arroy",
|
|
||||||
"bbqueue",
|
"bbqueue",
|
||||||
"big_s",
|
"big_s",
|
||||||
"bimap",
|
"bimap",
|
||||||
@ -3949,6 +3952,7 @@ dependencies = [
|
|||||||
"fxhash",
|
"fxhash",
|
||||||
"geoutils",
|
"geoutils",
|
||||||
"grenad",
|
"grenad",
|
||||||
|
"hannoy",
|
||||||
"hashbrown 0.15.4",
|
"hashbrown 0.15.4",
|
||||||
"heed",
|
"heed",
|
||||||
"hf-hub",
|
"hf-hub",
|
||||||
@ -4018,6 +4022,12 @@ dependencies = [
|
|||||||
"unicase",
|
"unicase",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "min-max-heap"
|
||||||
|
version = "1.3.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "2687e6cf9c00f48e9284cf9fd15f2ef341d03cc7743abf9df4c5f07fdee50b18"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "minimal-lexical"
|
name = "minimal-lexical"
|
||||||
version = "0.2.1"
|
version = "0.2.1"
|
||||||
@ -4358,15 +4368,6 @@ dependencies = [
|
|||||||
"num-traits",
|
"num-traits",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "ordered-float"
|
|
||||||
version = "4.6.0"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "7bb71e1b3fa6ca1c61f383464aaf2bb0e2f8e772a1f01d486832464de363b951"
|
|
||||||
dependencies = [
|
|
||||||
"num-traits",
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "ordered-float"
|
name = "ordered-float"
|
||||||
version = "5.0.0"
|
version = "5.0.0"
|
||||||
@ -4398,6 +4399,16 @@ dependencies = [
|
|||||||
"winapi",
|
"winapi",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "papaya"
|
||||||
|
version = "0.2.3"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "f92dd0b07c53a0a0c764db2ace8c541dc47320dad97c2200c2a637ab9dd2328f"
|
||||||
|
dependencies = [
|
||||||
|
"equivalent",
|
||||||
|
"seize",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "parking_lot"
|
name = "parking_lot"
|
||||||
version = "0.12.4"
|
version = "0.12.4"
|
||||||
@ -5449,6 +5460,16 @@ dependencies = [
|
|||||||
"time",
|
"time",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "seize"
|
||||||
|
version = "0.5.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "e4b8d813387d566f627f3ea1b914c068aac94c40ae27ec43f5f33bde65abefe7"
|
||||||
|
dependencies = [
|
||||||
|
"libc",
|
||||||
|
"windows-sys 0.52.0",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "semver"
|
name = "semver"
|
||||||
version = "1.0.26"
|
version = "1.0.26"
|
||||||
|
@ -143,10 +143,10 @@ impl IndexStats {
|
|||||||
///
|
///
|
||||||
/// - rtxn: a RO transaction for the index, obtained from `Index::read_txn()`.
|
/// - rtxn: a RO transaction for the index, obtained from `Index::read_txn()`.
|
||||||
pub fn new(index: &Index, rtxn: &RoTxn) -> milli::Result<Self> {
|
pub fn new(index: &Index, rtxn: &RoTxn) -> milli::Result<Self> {
|
||||||
let arroy_stats = index.arroy_stats(rtxn)?;
|
let hannoy_stats = index.hannoy_stats(rtxn)?;
|
||||||
Ok(IndexStats {
|
Ok(IndexStats {
|
||||||
number_of_embeddings: Some(arroy_stats.number_of_embeddings),
|
number_of_embeddings: Some(hannoy_stats.number_of_embeddings),
|
||||||
number_of_embedded_documents: Some(arroy_stats.documents.len()),
|
number_of_embedded_documents: Some(hannoy_stats.documents.len()),
|
||||||
documents_database_stats: index.documents_stats(rtxn)?.unwrap_or_default(),
|
documents_database_stats: index.documents_stats(rtxn)?.unwrap_or_default(),
|
||||||
number_of_documents: None,
|
number_of_documents: None,
|
||||||
database_size: index.on_disk_size()?,
|
database_size: index.on_disk_size()?,
|
||||||
|
@ -320,7 +320,7 @@ async fn binary_quantize_clear_documents() {
|
|||||||
}
|
}
|
||||||
"###);
|
"###);
|
||||||
|
|
||||||
// Make sure the arroy DB has been cleared
|
// Make sure the hannoy DB has been cleared
|
||||||
let (documents, _code) =
|
let (documents, _code) =
|
||||||
index.search_post(json!({ "hybrid": { "embedder": "manual" }, "vector": [1, 1, 1] })).await;
|
index.search_post(json!({ "hybrid": { "embedder": "manual" }, "vector": [1, 1, 1] })).await;
|
||||||
snapshot!(documents, @r###"
|
snapshot!(documents, @r###"
|
||||||
|
@ -682,7 +682,7 @@ async fn clear_documents() {
|
|||||||
}
|
}
|
||||||
"###);
|
"###);
|
||||||
|
|
||||||
// Make sure the arroy DB has been cleared
|
// Make sure the hannoy DB has been cleared
|
||||||
let (documents, _code) =
|
let (documents, _code) =
|
||||||
index.search_post(json!({ "vector": [1, 1, 1], "hybrid": {"embedder": "manual"} })).await;
|
index.search_post(json!({ "vector": [1, 1, 1], "hybrid": {"embedder": "manual"} })).await;
|
||||||
snapshot!(documents, @r###"
|
snapshot!(documents, @r###"
|
||||||
|
@ -243,7 +243,7 @@ async fn reset_embedder_documents() {
|
|||||||
}
|
}
|
||||||
"###);
|
"###);
|
||||||
|
|
||||||
// Make sure the arroy DB has been cleared
|
// Make sure the hannoy DB has been cleared
|
||||||
let (documents, _code) =
|
let (documents, _code) =
|
||||||
index.search_post(json!({ "vector": [1, 1, 1], "hybrid": {"embedder": "default"} })).await;
|
index.search_post(json!({ "vector": [1, 1, 1], "hybrid": {"embedder": "default"} })).await;
|
||||||
snapshot!(json_string!(documents), @r###"
|
snapshot!(json_string!(documents), @r###"
|
||||||
|
@ -141,8 +141,8 @@ enum Command {
|
|||||||
|
|
||||||
#[derive(Clone, ValueEnum)]
|
#[derive(Clone, ValueEnum)]
|
||||||
enum IndexPart {
|
enum IndexPart {
|
||||||
/// Will make the arroy index hot.
|
/// Will make the hannoy index hot.
|
||||||
Arroy,
|
Hannoy,
|
||||||
}
|
}
|
||||||
|
|
||||||
fn main() -> anyhow::Result<()> {
|
fn main() -> anyhow::Result<()> {
|
||||||
@ -648,12 +648,12 @@ fn hair_dryer(
|
|||||||
let rtxn = index.read_txn()?;
|
let rtxn = index.read_txn()?;
|
||||||
for part in index_parts {
|
for part in index_parts {
|
||||||
match part {
|
match part {
|
||||||
IndexPart::Arroy => {
|
IndexPart::Hannoy => {
|
||||||
let mut count = 0;
|
let mut count = 0;
|
||||||
let total = index.vector_arroy.len(&rtxn)?;
|
let total = index.vector_hannoy.len(&rtxn)?;
|
||||||
eprintln!("Hair drying arroy for {uid}...");
|
eprintln!("Hair drying hannoy for {uid}...");
|
||||||
for (i, result) in index
|
for (i, result) in index
|
||||||
.vector_arroy
|
.vector_hannoy
|
||||||
.remap_types::<Bytes, Bytes>()
|
.remap_types::<Bytes, Bytes>()
|
||||||
.iter(&rtxn)?
|
.iter(&rtxn)?
|
||||||
.enumerate()
|
.enumerate()
|
||||||
|
@ -68,7 +68,7 @@ pub fn v1_10_to_v1_11(
|
|||||||
)
|
)
|
||||||
})?;
|
})?;
|
||||||
let index_read_database =
|
let index_read_database =
|
||||||
try_opening_poly_database(&index_env, &index_rtxn, db_name::VECTOR_ARROY)
|
try_opening_poly_database(&index_env, &index_rtxn, db_name::VECTOR_HANNOY)
|
||||||
.with_context(|| format!("while updating date format for index `{uid}`"))?;
|
.with_context(|| format!("while updating date format for index `{uid}`"))?;
|
||||||
|
|
||||||
let mut index_wtxn = index_env.write_txn().with_context(|| {
|
let mut index_wtxn = index_env.write_txn().with_context(|| {
|
||||||
@ -79,15 +79,16 @@ pub fn v1_10_to_v1_11(
|
|||||||
})?;
|
})?;
|
||||||
|
|
||||||
let index_write_database =
|
let index_write_database =
|
||||||
try_opening_poly_database(&index_env, &index_wtxn, db_name::VECTOR_ARROY)
|
try_opening_poly_database(&index_env, &index_wtxn, db_name::VECTOR_HANNOY)
|
||||||
.with_context(|| format!("while updating date format for index `{uid}`"))?;
|
.with_context(|| format!("while updating date format for index `{uid}`"))?;
|
||||||
|
|
||||||
meilisearch_types::milli::arroy::upgrade::cosine_from_0_4_to_0_5(
|
// meilisearch_types::milli::hannoy::upgrade::cosine_from_0_4_to_0_5(
|
||||||
&index_rtxn,
|
// &index_rtxn,
|
||||||
index_read_database.remap_types(),
|
// index_read_database.remap_types(),
|
||||||
&mut index_wtxn,
|
// &mut index_wtxn,
|
||||||
index_write_database.remap_types(),
|
// index_write_database.remap_types(),
|
||||||
)?;
|
// )?;
|
||||||
|
unimplemented!("Hannoy doesn't support upgrading");
|
||||||
|
|
||||||
index_wtxn.commit()?;
|
index_wtxn.commit()?;
|
||||||
}
|
}
|
||||||
|
@ -87,7 +87,7 @@ rhai = { version = "1.22.2", features = [
|
|||||||
"no_time",
|
"no_time",
|
||||||
"sync",
|
"sync",
|
||||||
] }
|
] }
|
||||||
arroy = "0.6.1"
|
hannoy = { git = "https://github.com/nnethercott/hannoy", branch = "incremental-indexing" }
|
||||||
rand = "0.8.5"
|
rand = "0.8.5"
|
||||||
tracing = "0.1.41"
|
tracing = "0.1.41"
|
||||||
ureq = { version = "2.12.1", features = ["json"] }
|
ureq = { version = "2.12.1", features = ["json"] }
|
||||||
|
@ -76,7 +76,7 @@ pub enum InternalError {
|
|||||||
#[error("Cannot upgrade to the following version: v{0}.{1}.{2}.")]
|
#[error("Cannot upgrade to the following version: v{0}.{1}.{2}.")]
|
||||||
CannotUpgradeToVersion(u32, u32, u32),
|
CannotUpgradeToVersion(u32, u32, u32),
|
||||||
#[error(transparent)]
|
#[error(transparent)]
|
||||||
ArroyError(#[from] arroy::Error),
|
HannoyError(#[from] hannoy::Error),
|
||||||
#[error(transparent)]
|
#[error(transparent)]
|
||||||
VectorEmbeddingError(#[from] crate::vector::Error),
|
VectorEmbeddingError(#[from] crate::vector::Error),
|
||||||
}
|
}
|
||||||
@ -405,23 +405,24 @@ impl From<crate::vector::Error> for Error {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl From<arroy::Error> for Error {
|
impl From<hannoy::Error> for Error {
|
||||||
fn from(value: arroy::Error) -> Self {
|
fn from(value: hannoy::Error) -> Self {
|
||||||
match value {
|
match value {
|
||||||
arroy::Error::Heed(heed) => heed.into(),
|
hannoy::Error::Heed(heed) => heed.into(),
|
||||||
arroy::Error::Io(io) => io.into(),
|
hannoy::Error::Io(io) => io.into(),
|
||||||
arroy::Error::InvalidVecDimension { expected, received } => {
|
hannoy::Error::InvalidVecDimension { expected, received } => {
|
||||||
Error::UserError(UserError::InvalidVectorDimensions { expected, found: received })
|
Error::UserError(UserError::InvalidVectorDimensions { expected, found: received })
|
||||||
}
|
}
|
||||||
arroy::Error::BuildCancelled => Error::InternalError(InternalError::AbortedIndexation),
|
hannoy::Error::BuildCancelled => Error::InternalError(InternalError::AbortedIndexation),
|
||||||
arroy::Error::DatabaseFull
|
hannoy::Error::DatabaseFull
|
||||||
| arroy::Error::InvalidItemAppend
|
| hannoy::Error::InvalidItemAppend
|
||||||
| arroy::Error::UnmatchingDistance { .. }
|
| hannoy::Error::UnmatchingDistance { .. }
|
||||||
| arroy::Error::NeedBuild(_)
|
| hannoy::Error::NeedBuild(_)
|
||||||
| arroy::Error::MissingKey { .. }
|
| hannoy::Error::MissingKey { .. }
|
||||||
| arroy::Error::MissingMetadata(_)
|
| hannoy::Error::MissingMetadata(_)
|
||||||
| arroy::Error::CannotDecodeKeyMode { .. } => {
|
| hannoy::Error::UnknownVersion { .. }
|
||||||
Error::InternalError(InternalError::ArroyError(value))
|
| hannoy::Error::CannotDecodeKeyMode { .. } => {
|
||||||
|
Error::InternalError(InternalError::HannoyError(value))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -31,7 +31,7 @@ use crate::prompt::PromptData;
|
|||||||
use crate::proximity::ProximityPrecision;
|
use crate::proximity::ProximityPrecision;
|
||||||
use crate::update::new::StdResult;
|
use crate::update::new::StdResult;
|
||||||
use crate::vector::db::IndexEmbeddingConfigs;
|
use crate::vector::db::IndexEmbeddingConfigs;
|
||||||
use crate::vector::{ArroyStats, ArroyWrapper, Embedding};
|
use crate::vector::{Embedding, HannoyStats, HannoyWrapper};
|
||||||
use crate::{
|
use crate::{
|
||||||
default_criteria, CboRoaringBitmapCodec, Criterion, DocumentId, ExternalDocumentsIds,
|
default_criteria, CboRoaringBitmapCodec, Criterion, DocumentId, ExternalDocumentsIds,
|
||||||
FacetDistribution, FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldIdWordCountCodec,
|
FacetDistribution, FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldIdWordCountCodec,
|
||||||
@ -113,7 +113,7 @@ pub mod db_name {
|
|||||||
pub const FIELD_ID_DOCID_FACET_F64S: &str = "field-id-docid-facet-f64s";
|
pub const FIELD_ID_DOCID_FACET_F64S: &str = "field-id-docid-facet-f64s";
|
||||||
pub const FIELD_ID_DOCID_FACET_STRINGS: &str = "field-id-docid-facet-strings";
|
pub const FIELD_ID_DOCID_FACET_STRINGS: &str = "field-id-docid-facet-strings";
|
||||||
pub const VECTOR_EMBEDDER_CATEGORY_ID: &str = "vector-embedder-category-id";
|
pub const VECTOR_EMBEDDER_CATEGORY_ID: &str = "vector-embedder-category-id";
|
||||||
pub const VECTOR_ARROY: &str = "vector-arroy";
|
pub const VECTOR_HANNOY: &str = "vector-hannoy";
|
||||||
pub const DOCUMENTS: &str = "documents";
|
pub const DOCUMENTS: &str = "documents";
|
||||||
}
|
}
|
||||||
const NUMBER_OF_DBS: u32 = 25;
|
const NUMBER_OF_DBS: u32 = 25;
|
||||||
@ -177,10 +177,10 @@ pub struct Index {
|
|||||||
/// Maps the document id, the facet field id and the strings.
|
/// Maps the document id, the facet field id and the strings.
|
||||||
pub field_id_docid_facet_strings: Database<FieldDocIdFacetStringCodec, Str>,
|
pub field_id_docid_facet_strings: Database<FieldDocIdFacetStringCodec, Str>,
|
||||||
|
|
||||||
/// Maps an embedder name to its id in the arroy store.
|
/// Maps an embedder name to its id in the hannoy store.
|
||||||
pub(crate) embedder_category_id: Database<Unspecified, Unspecified>,
|
pub(crate) embedder_category_id: Database<Unspecified, Unspecified>,
|
||||||
/// Vector store based on arroy™.
|
/// Vector store based on hannoy™.
|
||||||
pub vector_arroy: arroy::Database<Unspecified>,
|
pub vector_hannoy: hannoy::Database<Unspecified>,
|
||||||
|
|
||||||
/// Maps the document id to the document as an obkv store.
|
/// Maps the document id to the document as an obkv store.
|
||||||
pub(crate) documents: Database<BEU32, ObkvCodec>,
|
pub(crate) documents: Database<BEU32, ObkvCodec>,
|
||||||
@ -237,7 +237,7 @@ impl Index {
|
|||||||
// vector stuff
|
// vector stuff
|
||||||
let embedder_category_id =
|
let embedder_category_id =
|
||||||
env.create_database(&mut wtxn, Some(VECTOR_EMBEDDER_CATEGORY_ID))?;
|
env.create_database(&mut wtxn, Some(VECTOR_EMBEDDER_CATEGORY_ID))?;
|
||||||
let vector_arroy = env.create_database(&mut wtxn, Some(VECTOR_ARROY))?;
|
let vector_hannoy = env.create_database(&mut wtxn, Some(VECTOR_HANNOY))?;
|
||||||
|
|
||||||
let documents = env.create_database(&mut wtxn, Some(DOCUMENTS))?;
|
let documents = env.create_database(&mut wtxn, Some(DOCUMENTS))?;
|
||||||
|
|
||||||
@ -264,7 +264,7 @@ impl Index {
|
|||||||
facet_id_is_empty_docids,
|
facet_id_is_empty_docids,
|
||||||
field_id_docid_facet_f64s,
|
field_id_docid_facet_f64s,
|
||||||
field_id_docid_facet_strings,
|
field_id_docid_facet_strings,
|
||||||
vector_arroy,
|
vector_hannoy,
|
||||||
embedder_category_id,
|
embedder_category_id,
|
||||||
documents,
|
documents,
|
||||||
};
|
};
|
||||||
@ -1771,8 +1771,8 @@ impl Index {
|
|||||||
let embedders = self.embedding_configs();
|
let embedders = self.embedding_configs();
|
||||||
for config in embedders.embedding_configs(rtxn)? {
|
for config in embedders.embedding_configs(rtxn)? {
|
||||||
let embedder_info = embedders.embedder_info(rtxn, &config.name)?.unwrap();
|
let embedder_info = embedders.embedder_info(rtxn, &config.name)?.unwrap();
|
||||||
let reader = ArroyWrapper::new(
|
let reader = HannoyWrapper::new(
|
||||||
self.vector_arroy,
|
self.vector_hannoy,
|
||||||
embedder_info.embedder_id,
|
embedder_info.embedder_id,
|
||||||
config.config.quantized(),
|
config.config.quantized(),
|
||||||
);
|
);
|
||||||
@ -1790,13 +1790,13 @@ impl Index {
|
|||||||
Ok(PrefixSettings { compute_prefixes, max_prefix_length: 4, prefix_count_threshold: 100 })
|
Ok(PrefixSettings { compute_prefixes, max_prefix_length: 4, prefix_count_threshold: 100 })
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn arroy_stats(&self, rtxn: &RoTxn<'_>) -> Result<ArroyStats> {
|
pub fn hannoy_stats(&self, rtxn: &RoTxn<'_>) -> Result<HannoyStats> {
|
||||||
let mut stats = ArroyStats::default();
|
let mut stats = HannoyStats::default();
|
||||||
let embedding_configs = self.embedding_configs();
|
let embedding_configs = self.embedding_configs();
|
||||||
for config in embedding_configs.embedding_configs(rtxn)? {
|
for config in embedding_configs.embedding_configs(rtxn)? {
|
||||||
let embedder_id = embedding_configs.embedder_id(rtxn, &config.name)?.unwrap();
|
let embedder_id = embedding_configs.embedder_id(rtxn, &config.name)?.unwrap();
|
||||||
let reader =
|
let reader =
|
||||||
ArroyWrapper::new(self.vector_arroy, embedder_id, config.config.quantized());
|
HannoyWrapper::new(self.vector_hannoy, embedder_id, config.config.quantized());
|
||||||
reader.aggregate_stats(rtxn, &mut stats)?;
|
reader.aggregate_stats(rtxn, &mut stats)?;
|
||||||
}
|
}
|
||||||
Ok(stats)
|
Ok(stats)
|
||||||
@ -1840,7 +1840,7 @@ impl Index {
|
|||||||
facet_id_is_empty_docids,
|
facet_id_is_empty_docids,
|
||||||
field_id_docid_facet_f64s,
|
field_id_docid_facet_f64s,
|
||||||
field_id_docid_facet_strings,
|
field_id_docid_facet_strings,
|
||||||
vector_arroy,
|
vector_hannoy,
|
||||||
embedder_category_id,
|
embedder_category_id,
|
||||||
documents,
|
documents,
|
||||||
} = self;
|
} = self;
|
||||||
@ -1911,7 +1911,7 @@ impl Index {
|
|||||||
"field_id_docid_facet_strings",
|
"field_id_docid_facet_strings",
|
||||||
field_id_docid_facet_strings.stat(rtxn).map(compute_size)?,
|
field_id_docid_facet_strings.stat(rtxn).map(compute_size)?,
|
||||||
);
|
);
|
||||||
sizes.insert("vector_arroy", vector_arroy.stat(rtxn).map(compute_size)?);
|
sizes.insert("vector_hannoy", vector_hannoy.stat(rtxn).map(compute_size)?);
|
||||||
sizes.insert("embedder_category_id", embedder_category_id.stat(rtxn).map(compute_size)?);
|
sizes.insert("embedder_category_id", embedder_category_id.stat(rtxn).map(compute_size)?);
|
||||||
sizes.insert("documents", documents.stat(rtxn).map(compute_size)?);
|
sizes.insert("documents", documents.stat(rtxn).map(compute_size)?);
|
||||||
|
|
||||||
|
@ -52,7 +52,7 @@ pub use search::new::{
|
|||||||
};
|
};
|
||||||
use serde_json::Value;
|
use serde_json::Value;
|
||||||
pub use thread_pool_no_abort::{PanicCatched, ThreadPoolNoAbort, ThreadPoolNoAbortBuilder};
|
pub use thread_pool_no_abort::{PanicCatched, ThreadPoolNoAbort, ThreadPoolNoAbortBuilder};
|
||||||
pub use {arroy, charabia as tokenizer, heed, rhai};
|
pub use {charabia as tokenizer, hannoy, heed, rhai};
|
||||||
|
|
||||||
pub use self::asc_desc::{AscDesc, AscDescError, Member, SortError};
|
pub use self::asc_desc::{AscDesc, AscDescError, Member, SortError};
|
||||||
pub use self::attribute_patterns::{AttributePatterns, PatternMatch};
|
pub use self::attribute_patterns::{AttributePatterns, PatternMatch};
|
||||||
|
@ -98,12 +98,12 @@ impl Progress {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// TODO: ideally we should expose the progress in a way that let arroy use it directly
|
// TODO: ideally we should expose the progress in a way that let arroy use it directly
|
||||||
pub(crate) fn update_progress_from_arroy(&self, progress: arroy::WriterProgress) {
|
// pub(crate) fn update_progress_from_hannoy(&self, progress: hannoy::WriterProgress) {
|
||||||
self.update_progress(progress.main);
|
// self.update_progress(progress.main);
|
||||||
if let Some(sub) = progress.sub {
|
// if let Some(sub) = progress.sub {
|
||||||
self.update_progress(sub);
|
// self.update_progress(sub);
|
||||||
}
|
// }
|
||||||
}
|
// }
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Generate the names associated with the durations and push them.
|
/// Generate the names associated with the durations and push them.
|
||||||
@ -277,43 +277,43 @@ impl<U: Send + Sync + 'static> Step for VariableNameStep<U> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Step for arroy::MainStep {
|
// impl Step for hannoy::MainStep {
|
||||||
fn name(&self) -> Cow<'static, str> {
|
// fn name(&self) -> Cow<'static, str> {
|
||||||
match self {
|
// match self {
|
||||||
arroy::MainStep::PreProcessingTheItems => "pre processing the items",
|
// hannoy::MainStep::PreProcessingTheItems => "pre processing the items",
|
||||||
arroy::MainStep::WritingTheDescendantsAndMetadata => {
|
// hannoy::MainStep::WritingTheDescendantsAndMetadata => {
|
||||||
"writing the descendants and metadata"
|
// "writing the descendants and metadata"
|
||||||
}
|
// }
|
||||||
arroy::MainStep::RetrieveTheUpdatedItems => "retrieve the updated items",
|
// hannoy::MainStep::RetrieveTheUpdatedItems => "retrieve the updated items",
|
||||||
arroy::MainStep::RetrievingTheTreeAndItemNodes => "retrieving the tree and item nodes",
|
// hannoy::MainStep::RetrievingTheTreeAndItemNodes => "retrieving the tree and item nodes",
|
||||||
arroy::MainStep::UpdatingTheTrees => "updating the trees",
|
// hannoy::MainStep::UpdatingTheTrees => "updating the trees",
|
||||||
arroy::MainStep::CreateNewTrees => "create new trees",
|
// hannoy::MainStep::CreateNewTrees => "create new trees",
|
||||||
arroy::MainStep::WritingNodesToDatabase => "writing nodes to database",
|
// hannoy::MainStep::WritingNodesToDatabase => "writing nodes to database",
|
||||||
arroy::MainStep::DeleteExtraneousTrees => "delete extraneous trees",
|
// hannoy::MainStep::DeleteExtraneousTrees => "delete extraneous trees",
|
||||||
arroy::MainStep::WriteTheMetadata => "write the metadata",
|
// hannoy::MainStep::WriteTheMetadata => "write the metadata",
|
||||||
}
|
// }
|
||||||
.into()
|
// .into()
|
||||||
}
|
// }
|
||||||
|
|
||||||
fn current(&self) -> u32 {
|
// fn current(&self) -> u32 {
|
||||||
*self as u32
|
// *self as u32
|
||||||
}
|
// }
|
||||||
|
|
||||||
fn total(&self) -> u32 {
|
// fn total(&self) -> u32 {
|
||||||
Self::CARDINALITY as u32
|
// Self::CARDINALITY as u32
|
||||||
}
|
// }
|
||||||
}
|
// }
|
||||||
|
|
||||||
impl Step for arroy::SubStep {
|
// impl Step for hannoy::SubStep {
|
||||||
fn name(&self) -> Cow<'static, str> {
|
// fn name(&self) -> Cow<'static, str> {
|
||||||
self.unit.into()
|
// self.unit.into()
|
||||||
}
|
// }
|
||||||
|
|
||||||
fn current(&self) -> u32 {
|
// fn current(&self) -> u32 {
|
||||||
self.current.load(Ordering::Relaxed)
|
// self.current.load(Ordering::Relaxed)
|
||||||
}
|
// }
|
||||||
|
|
||||||
fn total(&self) -> u32 {
|
// fn total(&self) -> u32 {
|
||||||
self.max
|
// self.max
|
||||||
}
|
// }
|
||||||
}
|
// }
|
||||||
|
@ -6,7 +6,7 @@ use roaring::RoaringBitmap;
|
|||||||
use super::ranking_rules::{RankingRule, RankingRuleOutput, RankingRuleQueryTrait};
|
use super::ranking_rules::{RankingRule, RankingRuleOutput, RankingRuleQueryTrait};
|
||||||
use super::VectorStoreStats;
|
use super::VectorStoreStats;
|
||||||
use crate::score_details::{self, ScoreDetails};
|
use crate::score_details::{self, ScoreDetails};
|
||||||
use crate::vector::{ArroyWrapper, DistributionShift, Embedder};
|
use crate::vector::{DistributionShift, Embedder, HannoyWrapper};
|
||||||
use crate::{DocumentId, Result, SearchContext, SearchLogger};
|
use crate::{DocumentId, Result, SearchContext, SearchLogger};
|
||||||
|
|
||||||
pub struct VectorSort<Q: RankingRuleQueryTrait> {
|
pub struct VectorSort<Q: RankingRuleQueryTrait> {
|
||||||
@ -56,7 +56,8 @@ impl<Q: RankingRuleQueryTrait> VectorSort<Q> {
|
|||||||
let target = &self.target;
|
let target = &self.target;
|
||||||
|
|
||||||
let before = Instant::now();
|
let before = Instant::now();
|
||||||
let reader = ArroyWrapper::new(ctx.index.vector_arroy, self.embedder_index, self.quantized);
|
let reader =
|
||||||
|
HannoyWrapper::new(ctx.index.vector_hannoy, self.embedder_index, self.quantized);
|
||||||
let results = reader.nns_by_vector(ctx.txn, target, self.limit, Some(vector_candidates))?;
|
let results = reader.nns_by_vector(ctx.txn, target, self.limit, Some(vector_candidates))?;
|
||||||
self.cached_sorted_docids = results.into_iter();
|
self.cached_sorted_docids = results.into_iter();
|
||||||
*ctx.vector_store_stats.get_or_insert_default() += VectorStoreStats {
|
*ctx.vector_store_stats.get_or_insert_default() += VectorStoreStats {
|
||||||
|
@ -3,7 +3,7 @@ use std::sync::Arc;
|
|||||||
use roaring::RoaringBitmap;
|
use roaring::RoaringBitmap;
|
||||||
|
|
||||||
use crate::score_details::{self, ScoreDetails};
|
use crate::score_details::{self, ScoreDetails};
|
||||||
use crate::vector::{ArroyWrapper, Embedder};
|
use crate::vector::{Embedder, HannoyWrapper};
|
||||||
use crate::{filtered_universe, DocumentId, Filter, Index, Result, SearchResult};
|
use crate::{filtered_universe, DocumentId, Filter, Index, Result, SearchResult};
|
||||||
|
|
||||||
pub struct Similar<'a> {
|
pub struct Similar<'a> {
|
||||||
@ -72,7 +72,7 @@ impl<'a> Similar<'a> {
|
|||||||
crate::UserError::InvalidSimilarEmbedder(self.embedder_name.to_owned())
|
crate::UserError::InvalidSimilarEmbedder(self.embedder_name.to_owned())
|
||||||
})?;
|
})?;
|
||||||
|
|
||||||
let reader = ArroyWrapper::new(self.index.vector_arroy, embedder_index, self.quantized);
|
let reader = HannoyWrapper::new(self.index.vector_hannoy, embedder_index, self.quantized);
|
||||||
let results = reader.nns_by_item(
|
let results = reader.nns_by_item(
|
||||||
self.rtxn,
|
self.rtxn,
|
||||||
self.id,
|
self.id,
|
||||||
|
@ -45,7 +45,7 @@ impl<'t, 'i> ClearDocuments<'t, 'i> {
|
|||||||
facet_id_is_empty_docids,
|
facet_id_is_empty_docids,
|
||||||
field_id_docid_facet_f64s,
|
field_id_docid_facet_f64s,
|
||||||
field_id_docid_facet_strings,
|
field_id_docid_facet_strings,
|
||||||
vector_arroy,
|
vector_hannoy,
|
||||||
embedder_category_id: _,
|
embedder_category_id: _,
|
||||||
documents,
|
documents,
|
||||||
} = self.index;
|
} = self.index;
|
||||||
@ -88,7 +88,7 @@ impl<'t, 'i> ClearDocuments<'t, 'i> {
|
|||||||
field_id_docid_facet_f64s.clear(self.wtxn)?;
|
field_id_docid_facet_f64s.clear(self.wtxn)?;
|
||||||
field_id_docid_facet_strings.clear(self.wtxn)?;
|
field_id_docid_facet_strings.clear(self.wtxn)?;
|
||||||
// vector
|
// vector
|
||||||
vector_arroy.clear(self.wtxn)?;
|
vector_hannoy.clear(self.wtxn)?;
|
||||||
|
|
||||||
documents.clear(self.wtxn)?;
|
documents.clear(self.wtxn)?;
|
||||||
|
|
||||||
|
@ -39,7 +39,7 @@ use crate::update::{
|
|||||||
IndexerConfig, UpdateIndexingStep, WordPrefixDocids, WordPrefixIntegerDocids, WordsPrefixesFst,
|
IndexerConfig, UpdateIndexingStep, WordPrefixDocids, WordPrefixIntegerDocids, WordsPrefixesFst,
|
||||||
};
|
};
|
||||||
use crate::vector::db::EmbedderInfo;
|
use crate::vector::db::EmbedderInfo;
|
||||||
use crate::vector::{ArroyWrapper, RuntimeEmbedders};
|
use crate::vector::{HannoyWrapper, RuntimeEmbedders};
|
||||||
use crate::{CboRoaringBitmapCodec, Index, Result, UserError};
|
use crate::{CboRoaringBitmapCodec, Index, Result, UserError};
|
||||||
|
|
||||||
static MERGED_DATABASE_COUNT: usize = 7;
|
static MERGED_DATABASE_COUNT: usize = 7;
|
||||||
@ -494,7 +494,7 @@ where
|
|||||||
},
|
},
|
||||||
)?;
|
)?;
|
||||||
let reader =
|
let reader =
|
||||||
ArroyWrapper::new(self.index.vector_arroy, index, action.was_quantized);
|
HannoyWrapper::new(self.index.vector_hannoy, index, action.was_quantized);
|
||||||
let Some(dim) = reader.dimensions(self.wtxn)? else {
|
let Some(dim) = reader.dimensions(self.wtxn)? else {
|
||||||
continue;
|
continue;
|
||||||
};
|
};
|
||||||
@ -504,7 +504,7 @@ where
|
|||||||
|
|
||||||
for (embedder_name, dimension) in dimension {
|
for (embedder_name, dimension) in dimension {
|
||||||
let wtxn = &mut *self.wtxn;
|
let wtxn = &mut *self.wtxn;
|
||||||
let vector_arroy = self.index.vector_arroy;
|
let vector_hannoy = self.index.vector_hannoy;
|
||||||
let cancel = &self.should_abort;
|
let cancel = &self.should_abort;
|
||||||
|
|
||||||
let embedder_index =
|
let embedder_index =
|
||||||
@ -523,7 +523,7 @@ where
|
|||||||
let is_quantizing = embedder_config.is_some_and(|action| action.is_being_quantized);
|
let is_quantizing = embedder_config.is_some_and(|action| action.is_being_quantized);
|
||||||
|
|
||||||
pool.install(|| {
|
pool.install(|| {
|
||||||
let mut writer = ArroyWrapper::new(vector_arroy, embedder_index, was_quantized);
|
let mut writer = HannoyWrapper::new(vector_hannoy, embedder_index, was_quantized);
|
||||||
writer.build_and_quantize(
|
writer.build_and_quantize(
|
||||||
wtxn,
|
wtxn,
|
||||||
// In the settings we don't have any progress to share
|
// In the settings we don't have any progress to share
|
||||||
|
@ -32,7 +32,7 @@ use crate::update::settings::{InnerIndexSettings, InnerIndexSettingsDiff};
|
|||||||
use crate::update::{AvailableIds, UpdateIndexingStep};
|
use crate::update::{AvailableIds, UpdateIndexingStep};
|
||||||
use crate::vector::parsed_vectors::{ExplicitVectors, VectorOrArrayOfVectors};
|
use crate::vector::parsed_vectors::{ExplicitVectors, VectorOrArrayOfVectors};
|
||||||
use crate::vector::settings::{RemoveFragments, WriteBackToDocuments};
|
use crate::vector::settings::{RemoveFragments, WriteBackToDocuments};
|
||||||
use crate::vector::ArroyWrapper;
|
use crate::vector::HannoyWrapper;
|
||||||
use crate::{FieldDistribution, FieldId, FieldIdMapMissingEntry, Index, Result};
|
use crate::{FieldDistribution, FieldId, FieldIdMapMissingEntry, Index, Result};
|
||||||
|
|
||||||
pub struct TransformOutput {
|
pub struct TransformOutput {
|
||||||
@ -834,15 +834,15 @@ impl<'a, 'i> Transform<'a, 'i> {
|
|||||||
None
|
None
|
||||||
};
|
};
|
||||||
|
|
||||||
let readers: BTreeMap<&str, (ArroyWrapper, &RoaringBitmap)> = settings_diff
|
let readers: BTreeMap<&str, (HannoyWrapper, &RoaringBitmap)> = settings_diff
|
||||||
.embedding_config_updates
|
.embedding_config_updates
|
||||||
.iter()
|
.iter()
|
||||||
.filter_map(|(name, action)| {
|
.filter_map(|(name, action)| {
|
||||||
if let Some(WriteBackToDocuments { embedder_id, user_provided }) =
|
if let Some(WriteBackToDocuments { embedder_id, user_provided }) =
|
||||||
action.write_back()
|
action.write_back()
|
||||||
{
|
{
|
||||||
let reader = ArroyWrapper::new(
|
let reader = HannoyWrapper::new(
|
||||||
self.index.vector_arroy,
|
self.index.vector_hannoy,
|
||||||
*embedder_id,
|
*embedder_id,
|
||||||
action.was_quantized,
|
action.was_quantized,
|
||||||
);
|
);
|
||||||
@ -884,7 +884,7 @@ impl<'a, 'i> Transform<'a, 'i> {
|
|||||||
|
|
||||||
let injected_vectors: std::result::Result<
|
let injected_vectors: std::result::Result<
|
||||||
serde_json::Map<String, serde_json::Value>,
|
serde_json::Map<String, serde_json::Value>,
|
||||||
arroy::Error,
|
hannoy::Error,
|
||||||
> = readers
|
> = readers
|
||||||
.iter()
|
.iter()
|
||||||
.filter_map(|(name, (reader, user_provided))| {
|
.filter_map(|(name, (reader, user_provided))| {
|
||||||
@ -949,9 +949,9 @@ impl<'a, 'i> Transform<'a, 'i> {
|
|||||||
else {
|
else {
|
||||||
continue;
|
continue;
|
||||||
};
|
};
|
||||||
let arroy =
|
let hannoy =
|
||||||
ArroyWrapper::new(self.index.vector_arroy, infos.embedder_id, was_quantized);
|
HannoyWrapper::new(self.index.vector_hannoy, infos.embedder_id, was_quantized);
|
||||||
let Some(dimensions) = arroy.dimensions(wtxn)? else {
|
let Some(dimensions) = hannoy.dimensions(wtxn)? else {
|
||||||
continue;
|
continue;
|
||||||
};
|
};
|
||||||
for fragment_id in fragment_ids {
|
for fragment_id in fragment_ids {
|
||||||
@ -959,17 +959,17 @@ impl<'a, 'i> Transform<'a, 'i> {
|
|||||||
|
|
||||||
if infos.embedding_status.user_provided_docids().is_empty() {
|
if infos.embedding_status.user_provided_docids().is_empty() {
|
||||||
// no user provided: clear store
|
// no user provided: clear store
|
||||||
arroy.clear_store(wtxn, *fragment_id, dimensions)?;
|
hannoy.clear_store(wtxn, *fragment_id, dimensions)?;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
// some user provided, remove only the ids that are not user provided
|
// some user provided, remove only the ids that are not user provided
|
||||||
let to_delete = arroy.items_in_store(wtxn, *fragment_id, |items| {
|
let to_delete = hannoy.items_in_store(wtxn, *fragment_id, |items| {
|
||||||
items - infos.embedding_status.user_provided_docids()
|
items - infos.embedding_status.user_provided_docids()
|
||||||
})?;
|
})?;
|
||||||
|
|
||||||
for to_delete in to_delete {
|
for to_delete in to_delete {
|
||||||
arroy.del_item_in_store(wtxn, to_delete, *fragment_id, dimensions)?;
|
hannoy.del_item_in_store(wtxn, to_delete, *fragment_id, dimensions)?;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -27,7 +27,7 @@ use crate::update::index_documents::helpers::{
|
|||||||
};
|
};
|
||||||
use crate::update::settings::InnerIndexSettingsDiff;
|
use crate::update::settings::InnerIndexSettingsDiff;
|
||||||
use crate::vector::db::{EmbeddingStatusDelta, IndexEmbeddingConfig};
|
use crate::vector::db::{EmbeddingStatusDelta, IndexEmbeddingConfig};
|
||||||
use crate::vector::ArroyWrapper;
|
use crate::vector::HannoyWrapper;
|
||||||
use crate::{
|
use crate::{
|
||||||
lat_lng_to_xyz, CboRoaringBitmapCodec, DocumentId, FieldId, GeoPoint, Index, InternalError,
|
lat_lng_to_xyz, CboRoaringBitmapCodec, DocumentId, FieldId, GeoPoint, Index, InternalError,
|
||||||
Result, SerializationError, U8StrStrCodec,
|
Result, SerializationError, U8StrStrCodec,
|
||||||
@ -677,7 +677,8 @@ pub(crate) fn write_typed_chunk_into_index(
|
|||||||
.get(&embedder_name)
|
.get(&embedder_name)
|
||||||
.is_some_and(|conf| conf.is_quantized);
|
.is_some_and(|conf| conf.is_quantized);
|
||||||
// FIXME: allow customizing distance
|
// FIXME: allow customizing distance
|
||||||
let writer = ArroyWrapper::new(index.vector_arroy, infos.embedder_id, binary_quantized);
|
let writer =
|
||||||
|
HannoyWrapper::new(index.vector_hannoy, infos.embedder_id, binary_quantized);
|
||||||
|
|
||||||
// remove vectors for docids we want them removed
|
// remove vectors for docids we want them removed
|
||||||
let merger = remove_vectors_builder.build();
|
let merger = remove_vectors_builder.build();
|
||||||
|
@ -255,9 +255,9 @@ impl<'a> From<FrameGrantR<'a>> for FrameWithHeader<'a> {
|
|||||||
#[repr(u8)]
|
#[repr(u8)]
|
||||||
pub enum EntryHeader {
|
pub enum EntryHeader {
|
||||||
DbOperation(DbOperation),
|
DbOperation(DbOperation),
|
||||||
ArroyDeleteVector(ArroyDeleteVector),
|
HannoyDeleteVector(HannoyDeleteVector),
|
||||||
ArroySetVectors(ArroySetVectors),
|
HannoySetVectors(HannoySetVectors),
|
||||||
ArroySetVector(ArroySetVector),
|
HannoySetVector(HannoySetVector),
|
||||||
}
|
}
|
||||||
|
|
||||||
impl EntryHeader {
|
impl EntryHeader {
|
||||||
@ -268,9 +268,9 @@ impl EntryHeader {
|
|||||||
const fn variant_id(&self) -> u8 {
|
const fn variant_id(&self) -> u8 {
|
||||||
match self {
|
match self {
|
||||||
EntryHeader::DbOperation(_) => 0,
|
EntryHeader::DbOperation(_) => 0,
|
||||||
EntryHeader::ArroyDeleteVector(_) => 1,
|
EntryHeader::HannoyDeleteVector(_) => 1,
|
||||||
EntryHeader::ArroySetVectors(_) => 2,
|
EntryHeader::HannoySetVectors(_) => 2,
|
||||||
EntryHeader::ArroySetVector(_) => 3,
|
EntryHeader::HannoySetVector(_) => 3,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -286,26 +286,26 @@ impl EntryHeader {
|
|||||||
}
|
}
|
||||||
|
|
||||||
const fn total_delete_vector_size() -> usize {
|
const fn total_delete_vector_size() -> usize {
|
||||||
Self::variant_size() + mem::size_of::<ArroyDeleteVector>()
|
Self::variant_size() + mem::size_of::<HannoyDeleteVector>()
|
||||||
}
|
}
|
||||||
|
|
||||||
/// The `dimensions` corresponds to the number of `f32` in the embedding.
|
/// The `dimensions` corresponds to the number of `f32` in the embedding.
|
||||||
fn total_set_vectors_size(count: usize, dimensions: usize) -> usize {
|
fn total_set_vectors_size(count: usize, dimensions: usize) -> usize {
|
||||||
let embedding_size = dimensions * mem::size_of::<f32>();
|
let embedding_size = dimensions * mem::size_of::<f32>();
|
||||||
Self::variant_size() + mem::size_of::<ArroySetVectors>() + embedding_size * count
|
Self::variant_size() + mem::size_of::<HannoySetVectors>() + embedding_size * count
|
||||||
}
|
}
|
||||||
|
|
||||||
fn total_set_vector_size(dimensions: usize) -> usize {
|
fn total_set_vector_size(dimensions: usize) -> usize {
|
||||||
let embedding_size = dimensions * mem::size_of::<f32>();
|
let embedding_size = dimensions * mem::size_of::<f32>();
|
||||||
Self::variant_size() + mem::size_of::<ArroySetVector>() + embedding_size
|
Self::variant_size() + mem::size_of::<HannoySetVector>() + embedding_size
|
||||||
}
|
}
|
||||||
|
|
||||||
fn header_size(&self) -> usize {
|
fn header_size(&self) -> usize {
|
||||||
let payload_size = match self {
|
let payload_size = match self {
|
||||||
EntryHeader::DbOperation(op) => mem::size_of_val(op),
|
EntryHeader::DbOperation(op) => mem::size_of_val(op),
|
||||||
EntryHeader::ArroyDeleteVector(adv) => mem::size_of_val(adv),
|
EntryHeader::HannoyDeleteVector(adv) => mem::size_of_val(adv),
|
||||||
EntryHeader::ArroySetVectors(asvs) => mem::size_of_val(asvs),
|
EntryHeader::HannoySetVectors(asvs) => mem::size_of_val(asvs),
|
||||||
EntryHeader::ArroySetVector(asv) => mem::size_of_val(asv),
|
EntryHeader::HannoySetVector(asv) => mem::size_of_val(asv),
|
||||||
};
|
};
|
||||||
Self::variant_size() + payload_size
|
Self::variant_size() + payload_size
|
||||||
}
|
}
|
||||||
@ -319,19 +319,19 @@ impl EntryHeader {
|
|||||||
EntryHeader::DbOperation(header)
|
EntryHeader::DbOperation(header)
|
||||||
}
|
}
|
||||||
1 => {
|
1 => {
|
||||||
let header_bytes = &remaining[..mem::size_of::<ArroyDeleteVector>()];
|
let header_bytes = &remaining[..mem::size_of::<HannoyDeleteVector>()];
|
||||||
let header = checked::pod_read_unaligned(header_bytes);
|
let header = checked::pod_read_unaligned(header_bytes);
|
||||||
EntryHeader::ArroyDeleteVector(header)
|
EntryHeader::HannoyDeleteVector(header)
|
||||||
}
|
}
|
||||||
2 => {
|
2 => {
|
||||||
let header_bytes = &remaining[..mem::size_of::<ArroySetVectors>()];
|
let header_bytes = &remaining[..mem::size_of::<HannoySetVectors>()];
|
||||||
let header = checked::pod_read_unaligned(header_bytes);
|
let header = checked::pod_read_unaligned(header_bytes);
|
||||||
EntryHeader::ArroySetVectors(header)
|
EntryHeader::HannoySetVectors(header)
|
||||||
}
|
}
|
||||||
3 => {
|
3 => {
|
||||||
let header_bytes = &remaining[..mem::size_of::<ArroySetVector>()];
|
let header_bytes = &remaining[..mem::size_of::<HannoySetVector>()];
|
||||||
let header = checked::pod_read_unaligned(header_bytes);
|
let header = checked::pod_read_unaligned(header_bytes);
|
||||||
EntryHeader::ArroySetVector(header)
|
EntryHeader::HannoySetVector(header)
|
||||||
}
|
}
|
||||||
id => panic!("invalid variant id: {id}"),
|
id => panic!("invalid variant id: {id}"),
|
||||||
}
|
}
|
||||||
@ -341,9 +341,9 @@ impl EntryHeader {
|
|||||||
let (first, remaining) = header_bytes.split_first_mut().unwrap();
|
let (first, remaining) = header_bytes.split_first_mut().unwrap();
|
||||||
let payload_bytes = match self {
|
let payload_bytes = match self {
|
||||||
EntryHeader::DbOperation(op) => bytemuck::bytes_of(op),
|
EntryHeader::DbOperation(op) => bytemuck::bytes_of(op),
|
||||||
EntryHeader::ArroyDeleteVector(adv) => bytemuck::bytes_of(adv),
|
EntryHeader::HannoyDeleteVector(adv) => bytemuck::bytes_of(adv),
|
||||||
EntryHeader::ArroySetVectors(asvs) => bytemuck::bytes_of(asvs),
|
EntryHeader::HannoySetVectors(asvs) => bytemuck::bytes_of(asvs),
|
||||||
EntryHeader::ArroySetVector(asv) => bytemuck::bytes_of(asv),
|
EntryHeader::HannoySetVector(asv) => bytemuck::bytes_of(asv),
|
||||||
};
|
};
|
||||||
*first = self.variant_id();
|
*first = self.variant_id();
|
||||||
remaining.copy_from_slice(payload_bytes);
|
remaining.copy_from_slice(payload_bytes);
|
||||||
@ -378,7 +378,7 @@ impl DbOperation {
|
|||||||
|
|
||||||
#[derive(Debug, Clone, Copy, NoUninit, CheckedBitPattern)]
|
#[derive(Debug, Clone, Copy, NoUninit, CheckedBitPattern)]
|
||||||
#[repr(transparent)]
|
#[repr(transparent)]
|
||||||
pub struct ArroyDeleteVector {
|
pub struct HannoyDeleteVector {
|
||||||
pub docid: DocumentId,
|
pub docid: DocumentId,
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -386,13 +386,13 @@ pub struct ArroyDeleteVector {
|
|||||||
#[repr(C)]
|
#[repr(C)]
|
||||||
/// The embeddings are in the remaining space and represents
|
/// The embeddings are in the remaining space and represents
|
||||||
/// non-aligned [f32] each with dimensions f32s.
|
/// non-aligned [f32] each with dimensions f32s.
|
||||||
pub struct ArroySetVectors {
|
pub struct HannoySetVectors {
|
||||||
pub docid: DocumentId,
|
pub docid: DocumentId,
|
||||||
pub embedder_id: u8,
|
pub embedder_id: u8,
|
||||||
_padding: [u8; 3],
|
_padding: [u8; 3],
|
||||||
}
|
}
|
||||||
|
|
||||||
impl ArroySetVectors {
|
impl HannoySetVectors {
|
||||||
fn embeddings_bytes<'a>(frame: &'a FrameGrantR<'_>) -> &'a [u8] {
|
fn embeddings_bytes<'a>(frame: &'a FrameGrantR<'_>) -> &'a [u8] {
|
||||||
let skip = EntryHeader::variant_size() + mem::size_of::<Self>();
|
let skip = EntryHeader::variant_size() + mem::size_of::<Self>();
|
||||||
&frame[skip..]
|
&frame[skip..]
|
||||||
@ -416,14 +416,14 @@ impl ArroySetVectors {
|
|||||||
#[repr(C)]
|
#[repr(C)]
|
||||||
/// The embeddings are in the remaining space and represents
|
/// The embeddings are in the remaining space and represents
|
||||||
/// non-aligned [f32] each with dimensions f32s.
|
/// non-aligned [f32] each with dimensions f32s.
|
||||||
pub struct ArroySetVector {
|
pub struct HannoySetVector {
|
||||||
pub docid: DocumentId,
|
pub docid: DocumentId,
|
||||||
pub embedder_id: u8,
|
pub embedder_id: u8,
|
||||||
pub extractor_id: u8,
|
pub extractor_id: u8,
|
||||||
_padding: [u8; 2],
|
_padding: [u8; 2],
|
||||||
}
|
}
|
||||||
|
|
||||||
impl ArroySetVector {
|
impl HannoySetVector {
|
||||||
fn embeddings_bytes<'a>(frame: &'a FrameGrantR<'_>) -> &'a [u8] {
|
fn embeddings_bytes<'a>(frame: &'a FrameGrantR<'_>) -> &'a [u8] {
|
||||||
let skip = EntryHeader::variant_size() + mem::size_of::<Self>();
|
let skip = EntryHeader::variant_size() + mem::size_of::<Self>();
|
||||||
&frame[skip..]
|
&frame[skip..]
|
||||||
@ -553,7 +553,7 @@ impl<'b> ExtractorBbqueueSender<'b> {
|
|||||||
let refcell = self.producers.get().unwrap();
|
let refcell = self.producers.get().unwrap();
|
||||||
let mut producer = refcell.0.borrow_mut_or_yield();
|
let mut producer = refcell.0.borrow_mut_or_yield();
|
||||||
|
|
||||||
let payload_header = EntryHeader::ArroyDeleteVector(ArroyDeleteVector { docid });
|
let payload_header = EntryHeader::HannoyDeleteVector(HannoyDeleteVector { docid });
|
||||||
let total_length = EntryHeader::total_delete_vector_size();
|
let total_length = EntryHeader::total_delete_vector_size();
|
||||||
if total_length > max_grant {
|
if total_length > max_grant {
|
||||||
panic!("The entry is larger ({total_length} bytes) than the BBQueue max grant ({max_grant} bytes)");
|
panic!("The entry is larger ({total_length} bytes) than the BBQueue max grant ({max_grant} bytes)");
|
||||||
@ -589,8 +589,8 @@ impl<'b> ExtractorBbqueueSender<'b> {
|
|||||||
// to zero to allocate no extra space at all
|
// to zero to allocate no extra space at all
|
||||||
let dimensions = embeddings.first().map_or(0, |emb| emb.len());
|
let dimensions = embeddings.first().map_or(0, |emb| emb.len());
|
||||||
|
|
||||||
let arroy_set_vector = ArroySetVectors { docid, embedder_id, _padding: [0; 3] };
|
let hannoy_set_vector = HannoySetVectors { docid, embedder_id, _padding: [0; 3] };
|
||||||
let payload_header = EntryHeader::ArroySetVectors(arroy_set_vector);
|
let payload_header = EntryHeader::HannoySetVectors(hannoy_set_vector);
|
||||||
let total_length = EntryHeader::total_set_vectors_size(embeddings.len(), dimensions);
|
let total_length = EntryHeader::total_set_vectors_size(embeddings.len(), dimensions);
|
||||||
if total_length > max_grant {
|
if total_length > max_grant {
|
||||||
let mut value_file = tempfile::tempfile().map(BufWriter::new)?;
|
let mut value_file = tempfile::tempfile().map(BufWriter::new)?;
|
||||||
@ -650,9 +650,9 @@ impl<'b> ExtractorBbqueueSender<'b> {
|
|||||||
// to zero to allocate no extra space at all
|
// to zero to allocate no extra space at all
|
||||||
let dimensions = embedding.as_ref().map_or(0, |emb| emb.len());
|
let dimensions = embedding.as_ref().map_or(0, |emb| emb.len());
|
||||||
|
|
||||||
let arroy_set_vector =
|
let hannoy_set_vector =
|
||||||
ArroySetVector { docid, embedder_id, extractor_id, _padding: [0; 2] };
|
HannoySetVector { docid, embedder_id, extractor_id, _padding: [0; 2] };
|
||||||
let payload_header = EntryHeader::ArroySetVector(arroy_set_vector);
|
let payload_header = EntryHeader::HannoySetVector(hannoy_set_vector);
|
||||||
let total_length = EntryHeader::total_set_vector_size(dimensions);
|
let total_length = EntryHeader::total_set_vector_size(dimensions);
|
||||||
if total_length > max_grant {
|
if total_length > max_grant {
|
||||||
let mut value_file = tempfile::tempfile().map(BufWriter::new)?;
|
let mut value_file = tempfile::tempfile().map(BufWriter::new)?;
|
||||||
|
@ -24,7 +24,7 @@ use crate::progress::{EmbedderStats, Progress};
|
|||||||
use crate::update::settings::SettingsDelta;
|
use crate::update::settings::SettingsDelta;
|
||||||
use crate::update::GrenadParameters;
|
use crate::update::GrenadParameters;
|
||||||
use crate::vector::settings::{EmbedderAction, RemoveFragments, WriteBackToDocuments};
|
use crate::vector::settings::{EmbedderAction, RemoveFragments, WriteBackToDocuments};
|
||||||
use crate::vector::{ArroyWrapper, Embedder, RuntimeEmbedders};
|
use crate::vector::{Embedder, HannoyWrapper, RuntimeEmbedders};
|
||||||
use crate::{FieldsIdsMap, GlobalFieldsIdsMap, Index, InternalError, Result, ThreadPoolNoAbort};
|
use crate::{FieldsIdsMap, GlobalFieldsIdsMap, Index, InternalError, Result, ThreadPoolNoAbort};
|
||||||
|
|
||||||
pub(crate) mod de;
|
pub(crate) mod de;
|
||||||
@ -66,7 +66,7 @@ where
|
|||||||
let mut bbbuffers = Vec::new();
|
let mut bbbuffers = Vec::new();
|
||||||
let finished_extraction = AtomicBool::new(false);
|
let finished_extraction = AtomicBool::new(false);
|
||||||
|
|
||||||
let arroy_memory = grenad_parameters.max_memory;
|
let hannoy_memory = grenad_parameters.max_memory;
|
||||||
|
|
||||||
let (grenad_parameters, total_bbbuffer_capacity) =
|
let (grenad_parameters, total_bbbuffer_capacity) =
|
||||||
indexer_memory_settings(pool.current_num_threads(), grenad_parameters);
|
indexer_memory_settings(pool.current_num_threads(), grenad_parameters);
|
||||||
@ -129,8 +129,8 @@ where
|
|||||||
|
|
||||||
let global_fields_ids_map = GlobalFieldsIdsMap::new(&new_fields_ids_map);
|
let global_fields_ids_map = GlobalFieldsIdsMap::new(&new_fields_ids_map);
|
||||||
|
|
||||||
let vector_arroy = index.vector_arroy;
|
let vector_arroy = index.vector_hannoy;
|
||||||
let arroy_writers: Result<HashMap<_, _>> = embedders
|
let hannoy_writers: Result<HashMap<_, _>> = embedders
|
||||||
.inner_as_ref()
|
.inner_as_ref()
|
||||||
.iter()
|
.iter()
|
||||||
.map(|(embedder_name, runtime)| {
|
.map(|(embedder_name, runtime)| {
|
||||||
@ -143,7 +143,7 @@ where
|
|||||||
})?;
|
})?;
|
||||||
|
|
||||||
let dimensions = runtime.embedder.dimensions();
|
let dimensions = runtime.embedder.dimensions();
|
||||||
let writer = ArroyWrapper::new(vector_arroy, embedder_index, runtime.is_quantized);
|
let writer = HannoyWrapper::new(vector_arroy, embedder_index, runtime.is_quantized);
|
||||||
|
|
||||||
Ok((
|
Ok((
|
||||||
embedder_index,
|
embedder_index,
|
||||||
@ -152,10 +152,10 @@ where
|
|||||||
})
|
})
|
||||||
.collect();
|
.collect();
|
||||||
|
|
||||||
let mut arroy_writers = arroy_writers?;
|
let mut hannoy_writers = hannoy_writers?;
|
||||||
|
|
||||||
let congestion =
|
let congestion =
|
||||||
write_to_db(writer_receiver, finished_extraction, index, wtxn, &arroy_writers)?;
|
write_to_db(writer_receiver, finished_extraction, index, wtxn, &hannoy_writers)?;
|
||||||
|
|
||||||
indexing_context.progress.update_progress(IndexingStep::WaitingForExtractors);
|
indexing_context.progress.update_progress(IndexingStep::WaitingForExtractors);
|
||||||
|
|
||||||
@ -169,8 +169,8 @@ where
|
|||||||
wtxn,
|
wtxn,
|
||||||
indexing_context.progress,
|
indexing_context.progress,
|
||||||
index_embeddings,
|
index_embeddings,
|
||||||
arroy_memory,
|
hannoy_memory,
|
||||||
&mut arroy_writers,
|
&mut hannoy_writers,
|
||||||
None,
|
None,
|
||||||
&indexing_context.must_stop_processing,
|
&indexing_context.must_stop_processing,
|
||||||
)
|
)
|
||||||
@ -226,7 +226,7 @@ where
|
|||||||
let mut bbbuffers = Vec::new();
|
let mut bbbuffers = Vec::new();
|
||||||
let finished_extraction = AtomicBool::new(false);
|
let finished_extraction = AtomicBool::new(false);
|
||||||
|
|
||||||
let arroy_memory = grenad_parameters.max_memory;
|
let hannoy_memory = grenad_parameters.max_memory;
|
||||||
|
|
||||||
let (grenad_parameters, total_bbbuffer_capacity) =
|
let (grenad_parameters, total_bbbuffer_capacity) =
|
||||||
indexer_memory_settings(pool.current_num_threads(), grenad_parameters);
|
indexer_memory_settings(pool.current_num_threads(), grenad_parameters);
|
||||||
@ -283,7 +283,7 @@ where
|
|||||||
let new_embedders = settings_delta.new_embedders();
|
let new_embedders = settings_delta.new_embedders();
|
||||||
let embedder_actions = settings_delta.embedder_actions();
|
let embedder_actions = settings_delta.embedder_actions();
|
||||||
let index_embedder_category_ids = settings_delta.new_embedder_category_id();
|
let index_embedder_category_ids = settings_delta.new_embedder_category_id();
|
||||||
let mut arroy_writers = arroy_writers_from_embedder_actions(
|
let mut hannoy_writers = hannoy_writers_from_embedder_actions(
|
||||||
index,
|
index,
|
||||||
embedder_actions,
|
embedder_actions,
|
||||||
new_embedders,
|
new_embedders,
|
||||||
@ -291,7 +291,7 @@ where
|
|||||||
)?;
|
)?;
|
||||||
|
|
||||||
let congestion =
|
let congestion =
|
||||||
write_to_db(writer_receiver, finished_extraction, index, wtxn, &arroy_writers)?;
|
write_to_db(writer_receiver, finished_extraction, index, wtxn, &hannoy_writers)?;
|
||||||
|
|
||||||
indexing_context.progress.update_progress(IndexingStep::WaitingForExtractors);
|
indexing_context.progress.update_progress(IndexingStep::WaitingForExtractors);
|
||||||
|
|
||||||
@ -305,8 +305,8 @@ where
|
|||||||
wtxn,
|
wtxn,
|
||||||
indexing_context.progress,
|
indexing_context.progress,
|
||||||
index_embeddings,
|
index_embeddings,
|
||||||
arroy_memory,
|
hannoy_memory,
|
||||||
&mut arroy_writers,
|
&mut hannoy_writers,
|
||||||
Some(embedder_actions),
|
Some(embedder_actions),
|
||||||
&indexing_context.must_stop_processing,
|
&indexing_context.must_stop_processing,
|
||||||
)
|
)
|
||||||
@ -336,13 +336,13 @@ where
|
|||||||
Ok(congestion)
|
Ok(congestion)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn arroy_writers_from_embedder_actions<'indexer>(
|
fn hannoy_writers_from_embedder_actions<'indexer>(
|
||||||
index: &Index,
|
index: &Index,
|
||||||
embedder_actions: &'indexer BTreeMap<String, EmbedderAction>,
|
embedder_actions: &'indexer BTreeMap<String, EmbedderAction>,
|
||||||
embedders: &'indexer RuntimeEmbedders,
|
embedders: &'indexer RuntimeEmbedders,
|
||||||
index_embedder_category_ids: &'indexer std::collections::HashMap<String, u8>,
|
index_embedder_category_ids: &'indexer std::collections::HashMap<String, u8>,
|
||||||
) -> Result<HashMap<u8, (&'indexer str, &'indexer Embedder, ArroyWrapper, usize)>> {
|
) -> Result<HashMap<u8, (&'indexer str, &'indexer Embedder, HannoyWrapper, usize)>> {
|
||||||
let vector_arroy = index.vector_arroy;
|
let vector_arroy = index.vector_hannoy;
|
||||||
|
|
||||||
embedders
|
embedders
|
||||||
.inner_as_ref()
|
.inner_as_ref()
|
||||||
@ -361,7 +361,7 @@ fn arroy_writers_from_embedder_actions<'indexer>(
|
|||||||
)));
|
)));
|
||||||
};
|
};
|
||||||
let writer =
|
let writer =
|
||||||
ArroyWrapper::new(vector_arroy, embedder_category_id, action.was_quantized);
|
HannoyWrapper::new(vector_arroy, embedder_category_id, action.was_quantized);
|
||||||
let dimensions = runtime.embedder.dimensions();
|
let dimensions = runtime.embedder.dimensions();
|
||||||
Some(Ok((
|
Some(Ok((
|
||||||
embedder_category_id,
|
embedder_category_id,
|
||||||
@ -384,7 +384,7 @@ where
|
|||||||
let Some(WriteBackToDocuments { embedder_id, .. }) = action.write_back() else {
|
let Some(WriteBackToDocuments { embedder_id, .. }) = action.write_back() else {
|
||||||
continue;
|
continue;
|
||||||
};
|
};
|
||||||
let reader = ArroyWrapper::new(index.vector_arroy, *embedder_id, action.was_quantized);
|
let reader = HannoyWrapper::new(index.vector_hannoy, *embedder_id, action.was_quantized);
|
||||||
let Some(dimensions) = reader.dimensions(wtxn)? else {
|
let Some(dimensions) = reader.dimensions(wtxn)? else {
|
||||||
continue;
|
continue;
|
||||||
};
|
};
|
||||||
@ -400,7 +400,7 @@ where
|
|||||||
let Some(infos) = index.embedding_configs().embedder_info(wtxn, embedder_name)? else {
|
let Some(infos) = index.embedding_configs().embedder_info(wtxn, embedder_name)? else {
|
||||||
continue;
|
continue;
|
||||||
};
|
};
|
||||||
let arroy = ArroyWrapper::new(index.vector_arroy, infos.embedder_id, was_quantized);
|
let arroy = HannoyWrapper::new(index.vector_hannoy, infos.embedder_id, was_quantized);
|
||||||
let Some(dimensions) = arroy.dimensions(wtxn)? else {
|
let Some(dimensions) = arroy.dimensions(wtxn)? else {
|
||||||
continue;
|
continue;
|
||||||
};
|
};
|
||||||
|
@ -15,7 +15,7 @@ use crate::progress::Progress;
|
|||||||
use crate::update::settings::InnerIndexSettings;
|
use crate::update::settings::InnerIndexSettings;
|
||||||
use crate::vector::db::IndexEmbeddingConfig;
|
use crate::vector::db::IndexEmbeddingConfig;
|
||||||
use crate::vector::settings::EmbedderAction;
|
use crate::vector::settings::EmbedderAction;
|
||||||
use crate::vector::{ArroyWrapper, Embedder, Embeddings, RuntimeEmbedders};
|
use crate::vector::{Embedder, Embeddings, HannoyWrapper, RuntimeEmbedders};
|
||||||
use crate::{Error, Index, InternalError, Result, UserError};
|
use crate::{Error, Index, InternalError, Result, UserError};
|
||||||
|
|
||||||
pub fn write_to_db(
|
pub fn write_to_db(
|
||||||
@ -23,9 +23,9 @@ pub fn write_to_db(
|
|||||||
finished_extraction: &AtomicBool,
|
finished_extraction: &AtomicBool,
|
||||||
index: &Index,
|
index: &Index,
|
||||||
wtxn: &mut RwTxn<'_>,
|
wtxn: &mut RwTxn<'_>,
|
||||||
arroy_writers: &HashMap<u8, (&str, &Embedder, ArroyWrapper, usize)>,
|
hannoy_writers: &HashMap<u8, (&str, &Embedder, HannoyWrapper, usize)>,
|
||||||
) -> Result<ChannelCongestion> {
|
) -> Result<ChannelCongestion> {
|
||||||
// Used by by the ArroySetVector to copy the embedding into an
|
// Used by by the HannoySetVector to copy the embedding into an
|
||||||
// aligned memory area, required by arroy to accept a new vector.
|
// aligned memory area, required by arroy to accept a new vector.
|
||||||
let mut aligned_embedding = Vec::new();
|
let mut aligned_embedding = Vec::new();
|
||||||
let span = tracing::trace_span!(target: "indexing::write_db", "all");
|
let span = tracing::trace_span!(target: "indexing::write_db", "all");
|
||||||
@ -56,7 +56,7 @@ pub fn write_to_db(
|
|||||||
ReceiverAction::LargeVectors(large_vectors) => {
|
ReceiverAction::LargeVectors(large_vectors) => {
|
||||||
let LargeVectors { docid, embedder_id, .. } = large_vectors;
|
let LargeVectors { docid, embedder_id, .. } = large_vectors;
|
||||||
let (_, _, writer, dimensions) =
|
let (_, _, writer, dimensions) =
|
||||||
arroy_writers.get(&embedder_id).expect("requested a missing embedder");
|
hannoy_writers.get(&embedder_id).expect("requested a missing embedder");
|
||||||
let mut embeddings = Embeddings::new(*dimensions);
|
let mut embeddings = Embeddings::new(*dimensions);
|
||||||
for embedding in large_vectors.read_embeddings(*dimensions) {
|
for embedding in large_vectors.read_embeddings(*dimensions) {
|
||||||
embeddings.push(embedding.to_vec()).unwrap();
|
embeddings.push(embedding.to_vec()).unwrap();
|
||||||
@ -68,7 +68,7 @@ pub fn write_to_db(
|
|||||||
large_vector @ LargeVector { docid, embedder_id, extractor_id, .. },
|
large_vector @ LargeVector { docid, embedder_id, extractor_id, .. },
|
||||||
) => {
|
) => {
|
||||||
let (_, _, writer, dimensions) =
|
let (_, _, writer, dimensions) =
|
||||||
arroy_writers.get(&embedder_id).expect("requested a missing embedder");
|
hannoy_writers.get(&embedder_id).expect("requested a missing embedder");
|
||||||
let embedding = large_vector.read_embedding(*dimensions);
|
let embedding = large_vector.read_embedding(*dimensions);
|
||||||
writer.add_item_in_store(wtxn, docid, extractor_id, embedding)?;
|
writer.add_item_in_store(wtxn, docid, extractor_id, embedding)?;
|
||||||
}
|
}
|
||||||
@ -80,12 +80,12 @@ pub fn write_to_db(
|
|||||||
&mut writer_receiver,
|
&mut writer_receiver,
|
||||||
index,
|
index,
|
||||||
wtxn,
|
wtxn,
|
||||||
arroy_writers,
|
hannoy_writers,
|
||||||
&mut aligned_embedding,
|
&mut aligned_embedding,
|
||||||
)?;
|
)?;
|
||||||
}
|
}
|
||||||
|
|
||||||
write_from_bbqueue(&mut writer_receiver, index, wtxn, arroy_writers, &mut aligned_embedding)?;
|
write_from_bbqueue(&mut writer_receiver, index, wtxn, hannoy_writers, &mut aligned_embedding)?;
|
||||||
|
|
||||||
Ok(ChannelCongestion {
|
Ok(ChannelCongestion {
|
||||||
attempts: writer_receiver.sent_messages_attempts(),
|
attempts: writer_receiver.sent_messages_attempts(),
|
||||||
@ -115,8 +115,8 @@ pub fn build_vectors<MSP>(
|
|||||||
wtxn: &mut RwTxn<'_>,
|
wtxn: &mut RwTxn<'_>,
|
||||||
progress: &Progress,
|
progress: &Progress,
|
||||||
index_embeddings: Vec<IndexEmbeddingConfig>,
|
index_embeddings: Vec<IndexEmbeddingConfig>,
|
||||||
arroy_memory: Option<usize>,
|
hannoy_memory: Option<usize>,
|
||||||
arroy_writers: &mut HashMap<u8, (&str, &Embedder, ArroyWrapper, usize)>,
|
hannoy_writers: &mut HashMap<u8, (&str, &Embedder, HannoyWrapper, usize)>,
|
||||||
embeder_actions: Option<&BTreeMap<String, EmbedderAction>>,
|
embeder_actions: Option<&BTreeMap<String, EmbedderAction>>,
|
||||||
must_stop_processing: &MSP,
|
must_stop_processing: &MSP,
|
||||||
) -> Result<()>
|
) -> Result<()>
|
||||||
@ -129,7 +129,7 @@ where
|
|||||||
|
|
||||||
let seed = rand::random();
|
let seed = rand::random();
|
||||||
let mut rng = rand::rngs::StdRng::seed_from_u64(seed);
|
let mut rng = rand::rngs::StdRng::seed_from_u64(seed);
|
||||||
for (_index, (embedder_name, _embedder, writer, dimensions)) in arroy_writers {
|
for (_index, (embedder_name, _embedder, writer, dimensions)) in hannoy_writers {
|
||||||
let dimensions = *dimensions;
|
let dimensions = *dimensions;
|
||||||
let is_being_quantized = embeder_actions
|
let is_being_quantized = embeder_actions
|
||||||
.and_then(|actions| actions.get(*embedder_name).map(|action| action.is_being_quantized))
|
.and_then(|actions| actions.get(*embedder_name).map(|action| action.is_being_quantized))
|
||||||
@ -140,7 +140,7 @@ where
|
|||||||
&mut rng,
|
&mut rng,
|
||||||
dimensions,
|
dimensions,
|
||||||
is_being_quantized,
|
is_being_quantized,
|
||||||
arroy_memory,
|
hannoy_memory,
|
||||||
must_stop_processing,
|
must_stop_processing,
|
||||||
)?;
|
)?;
|
||||||
}
|
}
|
||||||
@ -181,7 +181,7 @@ pub fn write_from_bbqueue(
|
|||||||
writer_receiver: &mut WriterBbqueueReceiver<'_>,
|
writer_receiver: &mut WriterBbqueueReceiver<'_>,
|
||||||
index: &Index,
|
index: &Index,
|
||||||
wtxn: &mut RwTxn<'_>,
|
wtxn: &mut RwTxn<'_>,
|
||||||
arroy_writers: &HashMap<u8, (&str, &crate::vector::Embedder, ArroyWrapper, usize)>,
|
hannoy_writers: &HashMap<u8, (&str, &crate::vector::Embedder, HannoyWrapper, usize)>,
|
||||||
aligned_embedding: &mut Vec<f32>,
|
aligned_embedding: &mut Vec<f32>,
|
||||||
) -> crate::Result<()> {
|
) -> crate::Result<()> {
|
||||||
while let Some(frame_with_header) = writer_receiver.recv_frame() {
|
while let Some(frame_with_header) = writer_receiver.recv_frame() {
|
||||||
@ -221,17 +221,17 @@ pub fn write_from_bbqueue(
|
|||||||
},
|
},
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
EntryHeader::ArroyDeleteVector(ArroyDeleteVector { docid }) => {
|
EntryHeader::HannoyDeleteVector(HannoyDeleteVector { docid }) => {
|
||||||
for (_index, (_name, _embedder, writer, dimensions)) in arroy_writers {
|
for (_index, (_name, _embedder, writer, dimensions)) in hannoy_writers {
|
||||||
let dimensions = *dimensions;
|
let dimensions = *dimensions;
|
||||||
writer.del_items(wtxn, dimensions, docid)?;
|
writer.del_items(wtxn, dimensions, docid)?;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
EntryHeader::ArroySetVectors(asvs) => {
|
EntryHeader::HannoySetVectors(asvs) => {
|
||||||
let ArroySetVectors { docid, embedder_id, .. } = asvs;
|
let HannoySetVectors { docid, embedder_id, .. } = asvs;
|
||||||
let frame = frame_with_header.frame();
|
let frame = frame_with_header.frame();
|
||||||
let (_, _, writer, dimensions) =
|
let (_, _, writer, dimensions) =
|
||||||
arroy_writers.get(&embedder_id).expect("requested a missing embedder");
|
hannoy_writers.get(&embedder_id).expect("requested a missing embedder");
|
||||||
let mut embeddings = Embeddings::new(*dimensions);
|
let mut embeddings = Embeddings::new(*dimensions);
|
||||||
let all_embeddings = asvs.read_all_embeddings_into_vec(frame, aligned_embedding);
|
let all_embeddings = asvs.read_all_embeddings_into_vec(frame, aligned_embedding);
|
||||||
writer.del_items(wtxn, *dimensions, docid)?;
|
writer.del_items(wtxn, *dimensions, docid)?;
|
||||||
@ -245,12 +245,12 @@ pub fn write_from_bbqueue(
|
|||||||
writer.add_items(wtxn, docid, &embeddings)?;
|
writer.add_items(wtxn, docid, &embeddings)?;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
EntryHeader::ArroySetVector(
|
EntryHeader::HannoySetVector(
|
||||||
asv @ ArroySetVector { docid, embedder_id, extractor_id, .. },
|
asv @ HannoySetVector { docid, embedder_id, extractor_id, .. },
|
||||||
) => {
|
) => {
|
||||||
let frame = frame_with_header.frame();
|
let frame = frame_with_header.frame();
|
||||||
let (_, _, writer, dimensions) =
|
let (_, _, writer, dimensions) =
|
||||||
arroy_writers.get(&embedder_id).expect("requested a missing embedder");
|
hannoy_writers.get(&embedder_id).expect("requested a missing embedder");
|
||||||
let embedding = asv.read_all_embeddings_into_vec(frame, aligned_embedding);
|
let embedding = asv.read_all_embeddings_into_vec(frame, aligned_embedding);
|
||||||
|
|
||||||
if embedding.is_empty() {
|
if embedding.is_empty() {
|
||||||
|
@ -14,7 +14,7 @@ use crate::constants::RESERVED_VECTORS_FIELD_NAME;
|
|||||||
use crate::documents::FieldIdMapper;
|
use crate::documents::FieldIdMapper;
|
||||||
use crate::vector::db::{EmbeddingStatus, IndexEmbeddingConfig};
|
use crate::vector::db::{EmbeddingStatus, IndexEmbeddingConfig};
|
||||||
use crate::vector::parsed_vectors::{RawVectors, RawVectorsError, VectorOrArrayOfVectors};
|
use crate::vector::parsed_vectors::{RawVectors, RawVectorsError, VectorOrArrayOfVectors};
|
||||||
use crate::vector::{ArroyWrapper, Embedding, RuntimeEmbedders};
|
use crate::vector::{Embedding, HannoyWrapper, RuntimeEmbedders};
|
||||||
use crate::{DocumentId, Index, InternalError, Result, UserError};
|
use crate::{DocumentId, Index, InternalError, Result, UserError};
|
||||||
|
|
||||||
#[derive(Serialize)]
|
#[derive(Serialize)]
|
||||||
@ -121,7 +121,7 @@ impl<'t> VectorDocumentFromDb<'t> {
|
|||||||
status: &EmbeddingStatus,
|
status: &EmbeddingStatus,
|
||||||
) -> Result<VectorEntry<'t>> {
|
) -> Result<VectorEntry<'t>> {
|
||||||
let reader =
|
let reader =
|
||||||
ArroyWrapper::new(self.index.vector_arroy, embedder_id, config.config.quantized());
|
HannoyWrapper::new(self.index.vector_hannoy, embedder_id, config.config.quantized());
|
||||||
let vectors = reader.item_vectors(self.rtxn, self.docid)?;
|
let vectors = reader.item_vectors(self.rtxn, self.docid)?;
|
||||||
|
|
||||||
Ok(VectorEntry {
|
Ok(VectorEntry {
|
||||||
@ -149,7 +149,7 @@ impl<'t> VectorDocument<'t> for VectorDocumentFromDb<'t> {
|
|||||||
name,
|
name,
|
||||||
entry_from_raw_value(value, false).map_err(|_| {
|
entry_from_raw_value(value, false).map_err(|_| {
|
||||||
InternalError::Serialization(crate::SerializationError::Decoding {
|
InternalError::Serialization(crate::SerializationError::Decoding {
|
||||||
db_name: Some(crate::index::db_name::VECTOR_ARROY),
|
db_name: Some(crate::index::db_name::VECTOR_HANNOY),
|
||||||
})
|
})
|
||||||
})?,
|
})?,
|
||||||
))
|
))
|
||||||
@ -167,7 +167,7 @@ impl<'t> VectorDocument<'t> for VectorDocumentFromDb<'t> {
|
|||||||
Some(embedding_from_doc) => {
|
Some(embedding_from_doc) => {
|
||||||
Some(entry_from_raw_value(embedding_from_doc, false).map_err(|_| {
|
Some(entry_from_raw_value(embedding_from_doc, false).map_err(|_| {
|
||||||
InternalError::Serialization(crate::SerializationError::Decoding {
|
InternalError::Serialization(crate::SerializationError::Decoding {
|
||||||
db_name: Some(crate::index::db_name::VECTOR_ARROY),
|
db_name: Some(crate::index::db_name::VECTOR_HANNOY),
|
||||||
})
|
})
|
||||||
})?)
|
})?)
|
||||||
}
|
}
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
use arroy::distances::Cosine;
|
use hannoy::distances::Cosine;
|
||||||
use heed::RwTxn;
|
use heed::RwTxn;
|
||||||
|
|
||||||
use super::UpgradeIndex;
|
use super::UpgradeIndex;
|
||||||
@ -25,12 +25,13 @@ impl UpgradeIndex for Latest_V1_13_To_Latest_V1_14 {
|
|||||||
progress.update_progress(VectorStore::UpdateInternalVersions);
|
progress.update_progress(VectorStore::UpdateInternalVersions);
|
||||||
|
|
||||||
let rtxn = index.read_txn()?;
|
let rtxn = index.read_txn()?;
|
||||||
arroy::upgrade::from_0_5_to_0_6::<Cosine>(
|
// hannoy::upgrade::from_0_5_to_0_6::<Cosine>(
|
||||||
&rtxn,
|
// &rtxn,
|
||||||
index.vector_arroy.remap_data_type(),
|
// index.vector_hannoy.remap_data_type(),
|
||||||
wtxn,
|
// wtxn,
|
||||||
index.vector_arroy.remap_data_type(),
|
// index.vector_hannoy.remap_data_type(),
|
||||||
)?;
|
// )?;
|
||||||
|
unimplemented!("upgrade hannoy");
|
||||||
|
|
||||||
Ok(false)
|
Ok(false)
|
||||||
}
|
}
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
use std::time::Instant;
|
use std::time::Instant;
|
||||||
|
|
||||||
use arroy::Distance;
|
use hannoy::Distance;
|
||||||
|
|
||||||
use super::error::CompositeEmbedderContainsHuggingFace;
|
use super::error::CompositeEmbedderContainsHuggingFace;
|
||||||
use super::{
|
use super::{
|
||||||
@ -307,19 +307,18 @@ fn check_similarity(
|
|||||||
}
|
}
|
||||||
|
|
||||||
for (left, right) in left.into_iter().zip(right) {
|
for (left, right) in left.into_iter().zip(right) {
|
||||||
let left = arroy::internals::UnalignedVector::from_slice(&left);
|
let left = hannoy::internals::UnalignedVector::from_slice(&left);
|
||||||
let right = arroy::internals::UnalignedVector::from_slice(&right);
|
let right = hannoy::internals::UnalignedVector::from_slice(&right);
|
||||||
let left = arroy::internals::Leaf {
|
let left = hannoy::internals::Item {
|
||||||
header: arroy::distances::Cosine::new_header(&left),
|
header: hannoy::distances::Cosine::new_header(&left),
|
||||||
vector: left,
|
vector: left,
|
||||||
};
|
};
|
||||||
let right = arroy::internals::Leaf {
|
let right = hannoy::internals::Item {
|
||||||
header: arroy::distances::Cosine::new_header(&right),
|
header: hannoy::distances::Cosine::new_header(&right),
|
||||||
vector: right,
|
vector: right,
|
||||||
};
|
};
|
||||||
|
|
||||||
let distance = arroy::distances::Cosine::built_distance(&left, &right);
|
let distance = hannoy::distances::Cosine::distance(&left, &right);
|
||||||
|
|
||||||
if distance > super::MAX_COMPOSITE_DISTANCE {
|
if distance > super::MAX_COMPOSITE_DISTANCE {
|
||||||
return Err(NewEmbedderError::composite_embedding_value_mismatch(distance, hint));
|
return Err(NewEmbedderError::composite_embedding_value_mismatch(distance, hint));
|
||||||
}
|
}
|
||||||
|
@ -3,9 +3,9 @@ use std::num::NonZeroUsize;
|
|||||||
use std::sync::{Arc, Mutex};
|
use std::sync::{Arc, Mutex};
|
||||||
use std::time::Instant;
|
use std::time::Instant;
|
||||||
|
|
||||||
use arroy::distances::{BinaryQuantizedCosine, Cosine};
|
|
||||||
use arroy::ItemId;
|
|
||||||
use deserr::{DeserializeError, Deserr};
|
use deserr::{DeserializeError, Deserr};
|
||||||
|
use hannoy::distances::{BinaryQuantizedCosine, Cosine};
|
||||||
|
use hannoy::ItemId;
|
||||||
use heed::{RoTxn, RwTxn, Unspecified};
|
use heed::{RoTxn, RwTxn, Unspecified};
|
||||||
use ordered_float::OrderedFloat;
|
use ordered_float::OrderedFloat;
|
||||||
use roaring::RoaringBitmap;
|
use roaring::RoaringBitmap;
|
||||||
@ -41,15 +41,15 @@ pub type Embedding = Vec<f32>;
|
|||||||
pub const REQUEST_PARALLELISM: usize = 40;
|
pub const REQUEST_PARALLELISM: usize = 40;
|
||||||
pub const MAX_COMPOSITE_DISTANCE: f32 = 0.01;
|
pub const MAX_COMPOSITE_DISTANCE: f32 = 0.01;
|
||||||
|
|
||||||
pub struct ArroyWrapper {
|
pub struct HannoyWrapper {
|
||||||
quantized: bool,
|
quantized: bool,
|
||||||
embedder_index: u8,
|
embedder_index: u8,
|
||||||
database: arroy::Database<Unspecified>,
|
database: hannoy::Database<Unspecified>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl ArroyWrapper {
|
impl HannoyWrapper {
|
||||||
pub fn new(
|
pub fn new(
|
||||||
database: arroy::Database<Unspecified>,
|
database: hannoy::Database<Unspecified>,
|
||||||
embedder_index: u8,
|
embedder_index: u8,
|
||||||
quantized: bool,
|
quantized: bool,
|
||||||
) -> Self {
|
) -> Self {
|
||||||
@ -60,19 +60,19 @@ impl ArroyWrapper {
|
|||||||
self.embedder_index
|
self.embedder_index
|
||||||
}
|
}
|
||||||
|
|
||||||
fn readers<'a, D: arroy::Distance>(
|
fn readers<'a, D: hannoy::Distance>(
|
||||||
&'a self,
|
&'a self,
|
||||||
rtxn: &'a RoTxn<'a>,
|
rtxn: &'a RoTxn<'a>,
|
||||||
db: arroy::Database<D>,
|
db: hannoy::Database<D>,
|
||||||
) -> impl Iterator<Item = Result<arroy::Reader<'a, D>, arroy::Error>> + 'a {
|
) -> impl Iterator<Item = Result<hannoy::Reader<'a, D>, hannoy::Error>> + 'a {
|
||||||
arroy_store_range_for_embedder(self.embedder_index).filter_map(move |index| {
|
hannoy_store_range_for_embedder(self.embedder_index).filter_map(move |index| {
|
||||||
match arroy::Reader::open(rtxn, index, db) {
|
match hannoy::Reader::open(rtxn, index, db) {
|
||||||
Ok(reader) => match reader.is_empty(rtxn) {
|
Ok(reader) => match reader.is_empty(rtxn) {
|
||||||
Ok(false) => Some(Ok(reader)),
|
Ok(false) => Some(Ok(reader)),
|
||||||
Ok(true) => None,
|
Ok(true) => None,
|
||||||
Err(e) => Some(Err(e)),
|
Err(e) => Some(Err(e)),
|
||||||
},
|
},
|
||||||
Err(arroy::Error::MissingMetadata(_)) => None,
|
Err(hannoy::Error::MissingMetadata(_)) => None,
|
||||||
Err(e) => Some(Err(e)),
|
Err(e) => Some(Err(e)),
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
@ -86,7 +86,7 @@ impl ArroyWrapper {
|
|||||||
rtxn: &RoTxn,
|
rtxn: &RoTxn,
|
||||||
store_id: u8,
|
store_id: u8,
|
||||||
with_items: F,
|
with_items: F,
|
||||||
) -> Result<O, arroy::Error>
|
) -> Result<O, hannoy::Error>
|
||||||
where
|
where
|
||||||
F: FnOnce(&RoaringBitmap) -> O,
|
F: FnOnce(&RoaringBitmap) -> O,
|
||||||
{
|
{
|
||||||
@ -97,26 +97,26 @@ impl ArroyWrapper {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn _items_in_store<D: arroy::Distance, F, O>(
|
fn _items_in_store<D: hannoy::Distance, F, O>(
|
||||||
&self,
|
&self,
|
||||||
rtxn: &RoTxn,
|
rtxn: &RoTxn,
|
||||||
db: arroy::Database<D>,
|
db: hannoy::Database<D>,
|
||||||
store_id: u8,
|
store_id: u8,
|
||||||
with_items: F,
|
with_items: F,
|
||||||
) -> Result<O, arroy::Error>
|
) -> Result<O, hannoy::Error>
|
||||||
where
|
where
|
||||||
F: FnOnce(&RoaringBitmap) -> O,
|
F: FnOnce(&RoaringBitmap) -> O,
|
||||||
{
|
{
|
||||||
let index = arroy_store_for_embedder(self.embedder_index, store_id);
|
let index = hannoy_store_for_embedder(self.embedder_index, store_id);
|
||||||
let reader = arroy::Reader::open(rtxn, index, db);
|
let reader = hannoy::Reader::open(rtxn, index, db);
|
||||||
match reader {
|
match reader {
|
||||||
Ok(reader) => Ok(with_items(reader.item_ids())),
|
Ok(reader) => Ok(with_items(reader.item_ids())),
|
||||||
Err(arroy::Error::MissingMetadata(_)) => Ok(with_items(&RoaringBitmap::new())),
|
Err(hannoy::Error::MissingMetadata(_)) => Ok(with_items(&RoaringBitmap::new())),
|
||||||
Err(err) => Err(err),
|
Err(err) => Err(err),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn dimensions(&self, rtxn: &RoTxn) -> Result<Option<usize>, arroy::Error> {
|
pub fn dimensions(&self, rtxn: &RoTxn) -> Result<Option<usize>, hannoy::Error> {
|
||||||
if self.quantized {
|
if self.quantized {
|
||||||
Ok(self
|
Ok(self
|
||||||
.readers(rtxn, self.quantized_db())
|
.readers(rtxn, self.quantized_db())
|
||||||
@ -140,39 +140,40 @@ impl ArroyWrapper {
|
|||||||
rng: &mut R,
|
rng: &mut R,
|
||||||
dimension: usize,
|
dimension: usize,
|
||||||
quantizing: bool,
|
quantizing: bool,
|
||||||
arroy_memory: Option<usize>,
|
hannoy_memory: Option<usize>,
|
||||||
cancel: &(impl Fn() -> bool + Sync + Send),
|
cancel: &(impl Fn() -> bool + Sync + Send),
|
||||||
) -> Result<(), arroy::Error> {
|
) -> Result<(), hannoy::Error> {
|
||||||
for index in arroy_store_range_for_embedder(self.embedder_index) {
|
for index in hannoy_store_range_for_embedder(self.embedder_index) {
|
||||||
if self.quantized {
|
if self.quantized {
|
||||||
let writer = arroy::Writer::new(self.quantized_db(), index, dimension);
|
let writer = hannoy::Writer::new(self.quantized_db(), index, dimension);
|
||||||
if writer.need_build(wtxn)? {
|
if writer.need_build(wtxn)? {
|
||||||
writer.builder(rng).build(wtxn)?
|
writer.builder(rng).build::<16, 32>(wtxn)?
|
||||||
} else if writer.is_empty(wtxn)? {
|
} else if writer.is_empty(wtxn)? {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
let writer = arroy::Writer::new(self.angular_db(), index, dimension);
|
let writer = hannoy::Writer::new(self.angular_db(), index, dimension);
|
||||||
// If we are quantizing the databases, we can't know from meilisearch
|
// If we are quantizing the databases, we can't know from meilisearch
|
||||||
// if the db was empty but still contained the wrong metadata, thus we need
|
// if the db was empty but still contained the wrong metadata, thus we need
|
||||||
// to quantize everything and can't stop early. Since this operation can
|
// to quantize everything and can't stop early. Since this operation can
|
||||||
// only happens once in the life of an embedder, it's not very performances
|
// only happens once in the life of an embedder, it's not very performances
|
||||||
// sensitive.
|
// sensitive.
|
||||||
if quantizing && !self.quantized {
|
if quantizing && !self.quantized {
|
||||||
let writer = writer.prepare_changing_distance::<BinaryQuantizedCosine>(wtxn)?;
|
// let writer = writer.prepare_changing_distance::<BinaryQuantizedCosine>(wtxn)?;
|
||||||
writer
|
// writer
|
||||||
.builder(rng)
|
// .builder(rng)
|
||||||
.available_memory(arroy_memory.unwrap_or(usize::MAX))
|
// .available_memory(hannoy_memory.unwrap_or(usize::MAX))
|
||||||
.progress(|step| progress.update_progress_from_arroy(step))
|
// .progress(|step| progress.update_progress_from_hannoy(step))
|
||||||
.cancel(cancel)
|
// .cancel(cancel)
|
||||||
.build(wtxn)?;
|
// .build(wtxn)?;
|
||||||
|
unimplemented!("switching from quantized to non-quantized");
|
||||||
} else if writer.need_build(wtxn)? {
|
} else if writer.need_build(wtxn)? {
|
||||||
writer
|
writer
|
||||||
.builder(rng)
|
.builder(rng)
|
||||||
.available_memory(arroy_memory.unwrap_or(usize::MAX))
|
.available_memory(hannoy_memory.unwrap_or(usize::MAX))
|
||||||
.progress(|step| progress.update_progress_from_arroy(step))
|
// .progress(|step| progress.update_progress_from_hannoy(step))
|
||||||
.cancel(cancel)
|
// .cancel(cancel)
|
||||||
.build(wtxn)?;
|
.build::<16, 32>(wtxn)?;
|
||||||
} else if writer.is_empty(wtxn)? {
|
} else if writer.is_empty(wtxn)? {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
@ -188,18 +189,18 @@ impl ArroyWrapper {
|
|||||||
pub fn add_items(
|
pub fn add_items(
|
||||||
&self,
|
&self,
|
||||||
wtxn: &mut RwTxn,
|
wtxn: &mut RwTxn,
|
||||||
item_id: arroy::ItemId,
|
item_id: hannoy::ItemId,
|
||||||
embeddings: &Embeddings<f32>,
|
embeddings: &Embeddings<f32>,
|
||||||
) -> Result<(), arroy::Error> {
|
) -> Result<(), hannoy::Error> {
|
||||||
let dimension = embeddings.dimension();
|
let dimension = embeddings.dimension();
|
||||||
for (index, vector) in
|
for (index, vector) in
|
||||||
arroy_store_range_for_embedder(self.embedder_index).zip(embeddings.iter())
|
hannoy_store_range_for_embedder(self.embedder_index).zip(embeddings.iter())
|
||||||
{
|
{
|
||||||
if self.quantized {
|
if self.quantized {
|
||||||
arroy::Writer::new(self.quantized_db(), index, dimension)
|
hannoy::Writer::new(self.quantized_db(), index, dimension)
|
||||||
.add_item(wtxn, item_id, vector)?
|
.add_item(wtxn, item_id, vector)?
|
||||||
} else {
|
} else {
|
||||||
arroy::Writer::new(self.angular_db(), index, dimension)
|
hannoy::Writer::new(self.angular_db(), index, dimension)
|
||||||
.add_item(wtxn, item_id, vector)?
|
.add_item(wtxn, item_id, vector)?
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -210,9 +211,9 @@ impl ArroyWrapper {
|
|||||||
pub fn add_item(
|
pub fn add_item(
|
||||||
&self,
|
&self,
|
||||||
wtxn: &mut RwTxn,
|
wtxn: &mut RwTxn,
|
||||||
item_id: arroy::ItemId,
|
item_id: hannoy::ItemId,
|
||||||
vector: &[f32],
|
vector: &[f32],
|
||||||
) -> Result<(), arroy::Error> {
|
) -> Result<(), hannoy::Error> {
|
||||||
if self.quantized {
|
if self.quantized {
|
||||||
self._add_item(wtxn, self.quantized_db(), item_id, vector)
|
self._add_item(wtxn, self.quantized_db(), item_id, vector)
|
||||||
} else {
|
} else {
|
||||||
@ -220,17 +221,17 @@ impl ArroyWrapper {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn _add_item<D: arroy::Distance>(
|
fn _add_item<D: hannoy::Distance>(
|
||||||
&self,
|
&self,
|
||||||
wtxn: &mut RwTxn,
|
wtxn: &mut RwTxn,
|
||||||
db: arroy::Database<D>,
|
db: hannoy::Database<D>,
|
||||||
item_id: arroy::ItemId,
|
item_id: hannoy::ItemId,
|
||||||
vector: &[f32],
|
vector: &[f32],
|
||||||
) -> Result<(), arroy::Error> {
|
) -> Result<(), hannoy::Error> {
|
||||||
let dimension = vector.len();
|
let dimension = vector.len();
|
||||||
|
|
||||||
for index in arroy_store_range_for_embedder(self.embedder_index) {
|
for index in hannoy_store_range_for_embedder(self.embedder_index) {
|
||||||
let writer = arroy::Writer::new(db, index, dimension);
|
let writer = hannoy::Writer::new(db, index, dimension);
|
||||||
if !writer.contains_item(wtxn, item_id)? {
|
if !writer.contains_item(wtxn, item_id)? {
|
||||||
writer.add_item(wtxn, item_id, vector)?;
|
writer.add_item(wtxn, item_id, vector)?;
|
||||||
break;
|
break;
|
||||||
@ -245,10 +246,10 @@ impl ArroyWrapper {
|
|||||||
pub fn add_item_in_store(
|
pub fn add_item_in_store(
|
||||||
&self,
|
&self,
|
||||||
wtxn: &mut RwTxn,
|
wtxn: &mut RwTxn,
|
||||||
item_id: arroy::ItemId,
|
item_id: hannoy::ItemId,
|
||||||
store_id: u8,
|
store_id: u8,
|
||||||
vector: &[f32],
|
vector: &[f32],
|
||||||
) -> Result<(), arroy::Error> {
|
) -> Result<(), hannoy::Error> {
|
||||||
if self.quantized {
|
if self.quantized {
|
||||||
self._add_item_in_store(wtxn, self.quantized_db(), item_id, store_id, vector)
|
self._add_item_in_store(wtxn, self.quantized_db(), item_id, store_id, vector)
|
||||||
} else {
|
} else {
|
||||||
@ -256,18 +257,18 @@ impl ArroyWrapper {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn _add_item_in_store<D: arroy::Distance>(
|
fn _add_item_in_store<D: hannoy::Distance>(
|
||||||
&self,
|
&self,
|
||||||
wtxn: &mut RwTxn,
|
wtxn: &mut RwTxn,
|
||||||
db: arroy::Database<D>,
|
db: hannoy::Database<D>,
|
||||||
item_id: arroy::ItemId,
|
item_id: hannoy::ItemId,
|
||||||
store_id: u8,
|
store_id: u8,
|
||||||
vector: &[f32],
|
vector: &[f32],
|
||||||
) -> Result<(), arroy::Error> {
|
) -> Result<(), hannoy::Error> {
|
||||||
let dimension = vector.len();
|
let dimension = vector.len();
|
||||||
|
|
||||||
let index = arroy_store_for_embedder(self.embedder_index, store_id);
|
let index = hannoy_store_for_embedder(self.embedder_index, store_id);
|
||||||
let writer = arroy::Writer::new(db, index, dimension);
|
let writer = hannoy::Writer::new(db, index, dimension);
|
||||||
writer.add_item(wtxn, item_id, vector)
|
writer.add_item(wtxn, item_id, vector)
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -276,14 +277,14 @@ impl ArroyWrapper {
|
|||||||
&self,
|
&self,
|
||||||
wtxn: &mut RwTxn,
|
wtxn: &mut RwTxn,
|
||||||
dimension: usize,
|
dimension: usize,
|
||||||
item_id: arroy::ItemId,
|
item_id: hannoy::ItemId,
|
||||||
) -> Result<(), arroy::Error> {
|
) -> Result<(), hannoy::Error> {
|
||||||
for index in arroy_store_range_for_embedder(self.embedder_index) {
|
for index in hannoy_store_range_for_embedder(self.embedder_index) {
|
||||||
if self.quantized {
|
if self.quantized {
|
||||||
let writer = arroy::Writer::new(self.quantized_db(), index, dimension);
|
let writer = hannoy::Writer::new(self.quantized_db(), index, dimension);
|
||||||
writer.del_item(wtxn, item_id)?;
|
writer.del_item(wtxn, item_id)?;
|
||||||
} else {
|
} else {
|
||||||
let writer = arroy::Writer::new(self.angular_db(), index, dimension);
|
let writer = hannoy::Writer::new(self.angular_db(), index, dimension);
|
||||||
writer.del_item(wtxn, item_id)?;
|
writer.del_item(wtxn, item_id)?;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -301,10 +302,10 @@ impl ArroyWrapper {
|
|||||||
pub fn del_item_in_store(
|
pub fn del_item_in_store(
|
||||||
&self,
|
&self,
|
||||||
wtxn: &mut RwTxn,
|
wtxn: &mut RwTxn,
|
||||||
item_id: arroy::ItemId,
|
item_id: hannoy::ItemId,
|
||||||
store_id: u8,
|
store_id: u8,
|
||||||
dimensions: usize,
|
dimensions: usize,
|
||||||
) -> Result<bool, arroy::Error> {
|
) -> Result<bool, hannoy::Error> {
|
||||||
if self.quantized {
|
if self.quantized {
|
||||||
self._del_item_in_store(wtxn, self.quantized_db(), item_id, store_id, dimensions)
|
self._del_item_in_store(wtxn, self.quantized_db(), item_id, store_id, dimensions)
|
||||||
} else {
|
} else {
|
||||||
@ -312,16 +313,16 @@ impl ArroyWrapper {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn _del_item_in_store<D: arroy::Distance>(
|
fn _del_item_in_store<D: hannoy::Distance>(
|
||||||
&self,
|
&self,
|
||||||
wtxn: &mut RwTxn,
|
wtxn: &mut RwTxn,
|
||||||
db: arroy::Database<D>,
|
db: hannoy::Database<D>,
|
||||||
item_id: arroy::ItemId,
|
item_id: hannoy::ItemId,
|
||||||
store_id: u8,
|
store_id: u8,
|
||||||
dimensions: usize,
|
dimensions: usize,
|
||||||
) -> Result<bool, arroy::Error> {
|
) -> Result<bool, hannoy::Error> {
|
||||||
let index = arroy_store_for_embedder(self.embedder_index, store_id);
|
let index = hannoy_store_for_embedder(self.embedder_index, store_id);
|
||||||
let writer = arroy::Writer::new(db, index, dimensions);
|
let writer = hannoy::Writer::new(db, index, dimensions);
|
||||||
writer.del_item(wtxn, item_id)
|
writer.del_item(wtxn, item_id)
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -335,7 +336,7 @@ impl ArroyWrapper {
|
|||||||
wtxn: &mut RwTxn,
|
wtxn: &mut RwTxn,
|
||||||
store_id: u8,
|
store_id: u8,
|
||||||
dimensions: usize,
|
dimensions: usize,
|
||||||
) -> Result<(), arroy::Error> {
|
) -> Result<(), hannoy::Error> {
|
||||||
if self.quantized {
|
if self.quantized {
|
||||||
self._clear_store(wtxn, self.quantized_db(), store_id, dimensions)
|
self._clear_store(wtxn, self.quantized_db(), store_id, dimensions)
|
||||||
} else {
|
} else {
|
||||||
@ -343,15 +344,15 @@ impl ArroyWrapper {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn _clear_store<D: arroy::Distance>(
|
fn _clear_store<D: hannoy::Distance>(
|
||||||
&self,
|
&self,
|
||||||
wtxn: &mut RwTxn,
|
wtxn: &mut RwTxn,
|
||||||
db: arroy::Database<D>,
|
db: hannoy::Database<D>,
|
||||||
store_id: u8,
|
store_id: u8,
|
||||||
dimensions: usize,
|
dimensions: usize,
|
||||||
) -> Result<(), arroy::Error> {
|
) -> Result<(), hannoy::Error> {
|
||||||
let index = arroy_store_for_embedder(self.embedder_index, store_id);
|
let index = hannoy_store_for_embedder(self.embedder_index, store_id);
|
||||||
let writer = arroy::Writer::new(db, index, dimensions);
|
let writer = hannoy::Writer::new(db, index, dimensions);
|
||||||
writer.clear(wtxn)
|
writer.clear(wtxn)
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -359,9 +360,9 @@ impl ArroyWrapper {
|
|||||||
pub fn del_item(
|
pub fn del_item(
|
||||||
&self,
|
&self,
|
||||||
wtxn: &mut RwTxn,
|
wtxn: &mut RwTxn,
|
||||||
item_id: arroy::ItemId,
|
item_id: hannoy::ItemId,
|
||||||
vector: &[f32],
|
vector: &[f32],
|
||||||
) -> Result<bool, arroy::Error> {
|
) -> Result<bool, hannoy::Error> {
|
||||||
if self.quantized {
|
if self.quantized {
|
||||||
self._del_item(wtxn, self.quantized_db(), item_id, vector)
|
self._del_item(wtxn, self.quantized_db(), item_id, vector)
|
||||||
} else {
|
} else {
|
||||||
@ -369,37 +370,34 @@ impl ArroyWrapper {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn _del_item<D: arroy::Distance>(
|
fn _del_item<D: hannoy::Distance>(
|
||||||
&self,
|
&self,
|
||||||
wtxn: &mut RwTxn,
|
wtxn: &mut RwTxn,
|
||||||
db: arroy::Database<D>,
|
db: hannoy::Database<D>,
|
||||||
item_id: arroy::ItemId,
|
item_id: hannoy::ItemId,
|
||||||
vector: &[f32],
|
vector: &[f32],
|
||||||
) -> Result<bool, arroy::Error> {
|
) -> Result<bool, hannoy::Error> {
|
||||||
let dimension = vector.len();
|
let dimension = vector.len();
|
||||||
|
|
||||||
for index in arroy_store_range_for_embedder(self.embedder_index) {
|
for index in hannoy_store_range_for_embedder(self.embedder_index) {
|
||||||
let writer = arroy::Writer::new(db, index, dimension);
|
let writer = hannoy::Writer::new(db, index, dimension);
|
||||||
let Some(candidate) = writer.item_vector(wtxn, item_id)? else {
|
if writer.contains_item(wtxn, item_id)? {
|
||||||
continue;
|
|
||||||
};
|
|
||||||
if candidate == vector {
|
|
||||||
return writer.del_item(wtxn, item_id);
|
return writer.del_item(wtxn, item_id);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
Ok(false)
|
Ok(false)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn clear(&self, wtxn: &mut RwTxn, dimension: usize) -> Result<(), arroy::Error> {
|
pub fn clear(&self, wtxn: &mut RwTxn, dimension: usize) -> Result<(), hannoy::Error> {
|
||||||
for index in arroy_store_range_for_embedder(self.embedder_index) {
|
for index in hannoy_store_range_for_embedder(self.embedder_index) {
|
||||||
if self.quantized {
|
if self.quantized {
|
||||||
let writer = arroy::Writer::new(self.quantized_db(), index, dimension);
|
let writer = hannoy::Writer::new(self.quantized_db(), index, dimension);
|
||||||
if writer.is_empty(wtxn)? {
|
if writer.is_empty(wtxn)? {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
writer.clear(wtxn)?;
|
writer.clear(wtxn)?;
|
||||||
} else {
|
} else {
|
||||||
let writer = arroy::Writer::new(self.angular_db(), index, dimension);
|
let writer = hannoy::Writer::new(self.angular_db(), index, dimension);
|
||||||
if writer.is_empty(wtxn)? {
|
if writer.is_empty(wtxn)? {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
@ -413,17 +411,17 @@ impl ArroyWrapper {
|
|||||||
&self,
|
&self,
|
||||||
rtxn: &RoTxn,
|
rtxn: &RoTxn,
|
||||||
dimension: usize,
|
dimension: usize,
|
||||||
item: arroy::ItemId,
|
item: hannoy::ItemId,
|
||||||
) -> Result<bool, arroy::Error> {
|
) -> Result<bool, hannoy::Error> {
|
||||||
for index in arroy_store_range_for_embedder(self.embedder_index) {
|
for index in hannoy_store_range_for_embedder(self.embedder_index) {
|
||||||
let contains = if self.quantized {
|
let contains = if self.quantized {
|
||||||
let writer = arroy::Writer::new(self.quantized_db(), index, dimension);
|
let writer = hannoy::Writer::new(self.quantized_db(), index, dimension);
|
||||||
if writer.is_empty(rtxn)? {
|
if writer.is_empty(rtxn)? {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
writer.contains_item(rtxn, item)?
|
writer.contains_item(rtxn, item)?
|
||||||
} else {
|
} else {
|
||||||
let writer = arroy::Writer::new(self.angular_db(), index, dimension);
|
let writer = hannoy::Writer::new(self.angular_db(), index, dimension);
|
||||||
if writer.is_empty(rtxn)? {
|
if writer.is_empty(rtxn)? {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
@ -442,7 +440,7 @@ impl ArroyWrapper {
|
|||||||
item: ItemId,
|
item: ItemId,
|
||||||
limit: usize,
|
limit: usize,
|
||||||
filter: Option<&RoaringBitmap>,
|
filter: Option<&RoaringBitmap>,
|
||||||
) -> Result<Vec<(ItemId, f32)>, arroy::Error> {
|
) -> Result<Vec<(ItemId, f32)>, hannoy::Error> {
|
||||||
if self.quantized {
|
if self.quantized {
|
||||||
self._nns_by_item(rtxn, self.quantized_db(), item, limit, filter)
|
self._nns_by_item(rtxn, self.quantized_db(), item, limit, filter)
|
||||||
} else {
|
} else {
|
||||||
@ -450,24 +448,25 @@ impl ArroyWrapper {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn _nns_by_item<D: arroy::Distance>(
|
fn _nns_by_item<D: hannoy::Distance>(
|
||||||
&self,
|
&self,
|
||||||
rtxn: &RoTxn,
|
rtxn: &RoTxn,
|
||||||
db: arroy::Database<D>,
|
db: hannoy::Database<D>,
|
||||||
item: ItemId,
|
item: ItemId,
|
||||||
limit: usize,
|
limit: usize,
|
||||||
filter: Option<&RoaringBitmap>,
|
filter: Option<&RoaringBitmap>,
|
||||||
) -> Result<Vec<(ItemId, f32)>, arroy::Error> {
|
) -> Result<Vec<(ItemId, f32)>, hannoy::Error> {
|
||||||
let mut results = Vec::new();
|
let mut results = Vec::new();
|
||||||
|
|
||||||
for reader in self.readers(rtxn, db) {
|
for reader in self.readers(rtxn, db) {
|
||||||
let reader = reader?;
|
let reader = reader?;
|
||||||
let mut searcher = reader.nns(limit);
|
let searcher = reader.nns(limit, limit * 2); // TODO find better ef
|
||||||
if let Some(filter) = filter {
|
if let Some(filter) = filter {
|
||||||
if reader.item_ids().is_disjoint(filter) {
|
if reader.item_ids().is_disjoint(filter) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
searcher.candidates(filter);
|
unimplemented!("Hannoy doesn't support filtering");
|
||||||
|
// searcher.candidates(filter);
|
||||||
}
|
}
|
||||||
|
|
||||||
if let Some(mut ret) = searcher.by_item(rtxn, item)? {
|
if let Some(mut ret) = searcher.by_item(rtxn, item)? {
|
||||||
@ -484,7 +483,7 @@ impl ArroyWrapper {
|
|||||||
vector: &[f32],
|
vector: &[f32],
|
||||||
limit: usize,
|
limit: usize,
|
||||||
filter: Option<&RoaringBitmap>,
|
filter: Option<&RoaringBitmap>,
|
||||||
) -> Result<Vec<(ItemId, f32)>, arroy::Error> {
|
) -> Result<Vec<(ItemId, f32)>, hannoy::Error> {
|
||||||
if self.quantized {
|
if self.quantized {
|
||||||
self._nns_by_vector(rtxn, self.quantized_db(), vector, limit, filter)
|
self._nns_by_vector(rtxn, self.quantized_db(), vector, limit, filter)
|
||||||
} else {
|
} else {
|
||||||
@ -492,24 +491,25 @@ impl ArroyWrapper {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn _nns_by_vector<D: arroy::Distance>(
|
fn _nns_by_vector<D: hannoy::Distance>(
|
||||||
&self,
|
&self,
|
||||||
rtxn: &RoTxn,
|
rtxn: &RoTxn,
|
||||||
db: arroy::Database<D>,
|
db: hannoy::Database<D>,
|
||||||
vector: &[f32],
|
vector: &[f32],
|
||||||
limit: usize,
|
limit: usize,
|
||||||
filter: Option<&RoaringBitmap>,
|
filter: Option<&RoaringBitmap>,
|
||||||
) -> Result<Vec<(ItemId, f32)>, arroy::Error> {
|
) -> Result<Vec<(ItemId, f32)>, hannoy::Error> {
|
||||||
let mut results = Vec::new();
|
let mut results = Vec::new();
|
||||||
|
|
||||||
for reader in self.readers(rtxn, db) {
|
for reader in self.readers(rtxn, db) {
|
||||||
let reader = reader?;
|
let reader = reader?;
|
||||||
let mut searcher = reader.nns(limit);
|
let searcher = reader.nns(limit, limit * 2); // TODO find better ef
|
||||||
if let Some(filter) = filter {
|
if let Some(filter) = filter {
|
||||||
if reader.item_ids().is_disjoint(filter) {
|
if reader.item_ids().is_disjoint(filter) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
searcher.candidates(filter);
|
unimplemented!("Hannoy doesn't support filteting");
|
||||||
|
// searcher.candidates(filter);
|
||||||
}
|
}
|
||||||
|
|
||||||
results.append(&mut searcher.by_vector(rtxn, vector)?);
|
results.append(&mut searcher.by_vector(rtxn, vector)?);
|
||||||
@ -520,7 +520,7 @@ impl ArroyWrapper {
|
|||||||
Ok(results)
|
Ok(results)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn item_vectors(&self, rtxn: &RoTxn, item_id: u32) -> Result<Vec<Vec<f32>>, arroy::Error> {
|
pub fn item_vectors(&self, rtxn: &RoTxn, item_id: u32) -> Result<Vec<Vec<f32>>, hannoy::Error> {
|
||||||
let mut vectors = Vec::new();
|
let mut vectors = Vec::new();
|
||||||
|
|
||||||
if self.quantized {
|
if self.quantized {
|
||||||
@ -539,19 +539,19 @@ impl ArroyWrapper {
|
|||||||
Ok(vectors)
|
Ok(vectors)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn angular_db(&self) -> arroy::Database<Cosine> {
|
fn angular_db(&self) -> hannoy::Database<Cosine> {
|
||||||
self.database.remap_data_type()
|
self.database.remap_data_type()
|
||||||
}
|
}
|
||||||
|
|
||||||
fn quantized_db(&self) -> arroy::Database<BinaryQuantizedCosine> {
|
fn quantized_db(&self) -> hannoy::Database<BinaryQuantizedCosine> {
|
||||||
self.database.remap_data_type()
|
self.database.remap_data_type()
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn aggregate_stats(
|
pub fn aggregate_stats(
|
||||||
&self,
|
&self,
|
||||||
rtxn: &RoTxn,
|
rtxn: &RoTxn,
|
||||||
stats: &mut ArroyStats,
|
stats: &mut HannoyStats,
|
||||||
) -> Result<(), arroy::Error> {
|
) -> Result<(), hannoy::Error> {
|
||||||
if self.quantized {
|
if self.quantized {
|
||||||
for reader in self.readers(rtxn, self.quantized_db()) {
|
for reader in self.readers(rtxn, self.quantized_db()) {
|
||||||
let reader = reader?;
|
let reader = reader?;
|
||||||
@ -579,10 +579,11 @@ impl ArroyWrapper {
|
|||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Default, Clone)]
|
#[derive(Debug, Default, Clone)]
|
||||||
pub struct ArroyStats {
|
pub struct HannoyStats {
|
||||||
pub number_of_embeddings: u64,
|
pub number_of_embeddings: u64,
|
||||||
pub documents: RoaringBitmap,
|
pub documents: RoaringBitmap,
|
||||||
}
|
}
|
||||||
|
|
||||||
/// One or multiple embeddings stored consecutively in a flat vector.
|
/// One or multiple embeddings stored consecutively in a flat vector.
|
||||||
#[derive(Debug, PartialEq)]
|
#[derive(Debug, PartialEq)]
|
||||||
pub struct Embeddings<F> {
|
pub struct Embeddings<F> {
|
||||||
@ -1208,11 +1209,11 @@ pub const fn is_cuda_enabled() -> bool {
|
|||||||
cfg!(feature = "cuda")
|
cfg!(feature = "cuda")
|
||||||
}
|
}
|
||||||
|
|
||||||
fn arroy_store_range_for_embedder(embedder_id: u8) -> impl Iterator<Item = u16> {
|
fn hannoy_store_range_for_embedder(embedder_id: u8) -> impl Iterator<Item = u16> {
|
||||||
(0..=u8::MAX).map(move |store_id| arroy_store_for_embedder(embedder_id, store_id))
|
(0..=u8::MAX).map(move |store_id| hannoy_store_for_embedder(embedder_id, store_id))
|
||||||
}
|
}
|
||||||
|
|
||||||
fn arroy_store_for_embedder(embedder_id: u8, store_id: u8) -> u16 {
|
fn hannoy_store_for_embedder(embedder_id: u8, store_id: u8) -> u16 {
|
||||||
let embedder_id = (embedder_id as u16) << 8;
|
let embedder_id = (embedder_id as u16) << 8;
|
||||||
embedder_id | (store_id as u16)
|
embedder_id | (store_id as u16)
|
||||||
}
|
}
|
||||||
|
Reference in New Issue
Block a user