This commit is contained in:
Louis Dureuil
2023-12-07 17:03:10 +01:00
parent dde3a04679
commit cb4ebe163e
8 changed files with 185 additions and 157 deletions

2
Cargo.lock generated
View File

@@ -383,7 +383,7 @@ dependencies = [
[[package]] [[package]]
name = "arroy" name = "arroy"
version = "0.1.0" version = "0.1.0"
source = "git+https://github.com/meilisearch/arroy.git#4b59476f457e5443ff250ea10d40d8b66a692674" source = "git+https://github.com/meilisearch/arroy.git#0079af0ec960bc9c51dd66e898a6b5e980cbb083"
dependencies = [ dependencies = [
"bytemuck", "bytemuck",
"byteorder", "byteorder",

View File

@@ -61,6 +61,8 @@ pub enum InternalError {
AbortedIndexation, AbortedIndexation,
#[error("The matching words list contains at least one invalid member.")] #[error("The matching words list contains at least one invalid member.")]
InvalidMatchingWords, InvalidMatchingWords,
#[error(transparent)]
ArroyError(#[from] arroy::Error),
} }
#[derive(Error, Debug)] #[derive(Error, Debug)]
@@ -190,6 +192,24 @@ only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and undersco
InvalidPromptForEmbeddings(String, crate::prompt::error::NewPromptError), InvalidPromptForEmbeddings(String, crate::prompt::error::NewPromptError),
} }
impl From<arroy::Error> for Error {
fn from(value: arroy::Error) -> Self {
match value {
arroy::Error::Heed(heed) => heed.into(),
arroy::Error::Io(io) => io.into(),
arroy::Error::InvalidVecDimension { expected, received } => {
Error::UserError(UserError::InvalidVectorDimensions { expected, found: received })
}
arroy::Error::DatabaseFull
| arroy::Error::InvalidItemAppend
| arroy::Error::UnmatchingDistance { .. }
| arroy::Error::MissingMetadata => {
Error::InternalError(InternalError::ArroyError(value))
}
}
}
}
#[derive(Error, Debug)] #[derive(Error, Debug)]
pub enum GeoError { pub enum GeoError {
#[error("The `_geo` field in the document with the id: `{document_id}` is not an object. Was expecting an object with the `_geo.lat` and `_geo.lng` fields but instead got `{value}`.")] #[error("The `_geo` field in the document with the id: `{document_id}` is not an object. Was expecting an object with the `_geo.lat` and `_geo.lng` fields but instead got `{value}`.")]

View File

@@ -70,7 +70,6 @@ pub mod main_key {
pub const SORT_FACET_VALUES_BY: &str = "sort-facet-values-by"; pub const SORT_FACET_VALUES_BY: &str = "sort-facet-values-by";
pub const PAGINATION_MAX_TOTAL_HITS: &str = "pagination-max-total-hits"; pub const PAGINATION_MAX_TOTAL_HITS: &str = "pagination-max-total-hits";
pub const PROXIMITY_PRECISION: &str = "proximity-precision"; pub const PROXIMITY_PRECISION: &str = "proximity-precision";
pub const VECTOR_UNAVAILABLE_VECTOR_IDS: &str = "vector-unavailable-vector-ids";
pub const EMBEDDING_CONFIGS: &str = "embedding_configs"; pub const EMBEDDING_CONFIGS: &str = "embedding_configs";
} }
@@ -97,8 +96,6 @@ pub mod db_name {
pub const FACET_ID_STRING_FST: &str = "facet-id-string-fst"; pub const FACET_ID_STRING_FST: &str = "facet-id-string-fst";
pub const FIELD_ID_DOCID_FACET_F64S: &str = "field-id-docid-facet-f64s"; pub const FIELD_ID_DOCID_FACET_F64S: &str = "field-id-docid-facet-f64s";
pub const FIELD_ID_DOCID_FACET_STRINGS: &str = "field-id-docid-facet-strings"; pub const FIELD_ID_DOCID_FACET_STRINGS: &str = "field-id-docid-facet-strings";
pub const VECTOR_ID_DOCID: &str = "vector-id-docids";
pub const VECTOR_DOCID_IDS: &str = "vector-docid-ids";
pub const VECTOR_EMBEDDER_CATEGORY_ID: &str = "vector-embedder-category-id"; pub const VECTOR_EMBEDDER_CATEGORY_ID: &str = "vector-embedder-category-id";
pub const VECTOR_ARROY: &str = "vector-arroy"; pub const VECTOR_ARROY: &str = "vector-arroy";
pub const DOCUMENTS: &str = "documents"; pub const DOCUMENTS: &str = "documents";
@@ -167,16 +164,10 @@ pub struct Index {
/// Maps the document id, the facet field id and the strings. /// Maps the document id, the facet field id and the strings.
pub field_id_docid_facet_strings: Database<FieldDocIdFacetStringCodec, Str>, pub field_id_docid_facet_strings: Database<FieldDocIdFacetStringCodec, Str>,
/// Maps a vector id to its document id.
pub vector_id_docid: Database<BEU32, BEU32>,
/// Maps a doc id to its vector ids.
pub docid_vector_ids: Database<BEU32, CboRoaringBitmapCodec>,
/// Maps an embedder name to its id in the arroy store. /// Maps an embedder name to its id in the arroy store.
pub embedder_category_id: Database<Str, BEU16>, pub embedder_category_id: Database<Str, U8>,
/// Vector store based on arroy™. /// Vector store based on arroy™.
pub vector_arroy: arroy::Database<arroy::distances::DotProduct>, pub vector_arroy: arroy::Database<arroy::distances::Angular>,
/// Maps the document id to the document as an obkv store. /// Maps the document id to the document as an obkv store.
pub(crate) documents: Database<BEU32, ObkvCodec>, pub(crate) documents: Database<BEU32, ObkvCodec>,
@@ -191,7 +182,7 @@ impl Index {
) -> Result<Index> { ) -> Result<Index> {
use db_name::*; use db_name::*;
options.max_dbs(27); options.max_dbs(25);
let env = options.open(path)?; let env = options.open(path)?;
let mut wtxn = env.write_txn()?; let mut wtxn = env.write_txn()?;
@@ -232,8 +223,6 @@ impl Index {
let field_id_docid_facet_strings = let field_id_docid_facet_strings =
env.create_database(&mut wtxn, Some(FIELD_ID_DOCID_FACET_STRINGS))?; env.create_database(&mut wtxn, Some(FIELD_ID_DOCID_FACET_STRINGS))?;
// vector stuff // vector stuff
let vector_id_docid = env.create_database(&mut wtxn, Some(VECTOR_ID_DOCID))?;
let docid_vector_ids = env.create_database(&mut wtxn, Some(VECTOR_DOCID_IDS))?;
let embedder_category_id = let embedder_category_id =
env.create_database(&mut wtxn, Some(VECTOR_EMBEDDER_CATEGORY_ID))?; env.create_database(&mut wtxn, Some(VECTOR_EMBEDDER_CATEGORY_ID))?;
let vector_arroy = env.create_database(&mut wtxn, Some(VECTOR_ARROY))?; let vector_arroy = env.create_database(&mut wtxn, Some(VECTOR_ARROY))?;
@@ -267,9 +256,7 @@ impl Index {
facet_id_is_empty_docids, facet_id_is_empty_docids,
field_id_docid_facet_f64s, field_id_docid_facet_f64s,
field_id_docid_facet_strings, field_id_docid_facet_strings,
vector_id_docid,
vector_arroy, vector_arroy,
docid_vector_ids,
embedder_category_id, embedder_category_id,
documents, documents,
}) })
@@ -1516,30 +1503,6 @@ impl Index {
.get(rtxn, main_key::EMBEDDING_CONFIGS)? .get(rtxn, main_key::EMBEDDING_CONFIGS)?
.unwrap_or_default()) .unwrap_or_default())
} }
/// Writes `unavailable_vector_ids` into the main database under
/// `main_key::VECTOR_UNAVAILABLE_VECTOR_IDS`, encoded with `CboRoaringBitmapCodec`.
pub(crate) fn put_unavailable_vector_ids(
    &self,
    wtxn: &mut RwTxn<'_>,
    unavailable_vector_ids: RoaringBitmap,
) -> heed::Result<()> {
    // View the main database as `Str` keys mapping to roaring bitmaps.
    let bitmaps = self.main.remap_types::<Str, CboRoaringBitmapCodec>();
    bitmaps.put(wtxn, main_key::VECTOR_UNAVAILABLE_VECTOR_IDS, &unavailable_vector_ids)
}
/// Deletes the `main_key::VECTOR_UNAVAILABLE_VECTOR_IDS` entry from the main database.
///
/// Returns the boolean reported by the underlying `delete` call
/// (presumably whether an entry was present; confirm against heed).
pub(crate) fn delete_unavailable_vector_ids(&self, wtxn: &mut RwTxn<'_>) -> heed::Result<bool> {
    let by_str_key = self.main.remap_key_type::<Str>();
    by_str_key.delete(wtxn, main_key::VECTOR_UNAVAILABLE_VECTOR_IDS)
}
/// Reads the bitmap stored under `main_key::VECTOR_UNAVAILABLE_VECTOR_IDS`
/// in the main database.
///
/// Returns a default (empty) `RoaringBitmap` when no entry is stored.
pub fn unavailable_vector_ids(&self, rtxn: &RoTxn<'_>) -> Result<RoaringBitmap> {
    let bitmaps = self.main.remap_types::<Str, CboRoaringBitmapCodec>();
    let stored = bitmaps.get(rtxn, main_key::VECTOR_UNAVAILABLE_VECTOR_IDS)?;
    Ok(stored.unwrap_or_default())
}
} }
#[cfg(test)] #[cfg(test)]

View File

@@ -262,6 +262,7 @@ fn get_ranking_rules_for_vector<'ctx>(
ctx: &SearchContext<'ctx>, ctx: &SearchContext<'ctx>,
sort_criteria: &Option<Vec<AscDesc>>, sort_criteria: &Option<Vec<AscDesc>>,
geo_strategy: geo_sort::Strategy, geo_strategy: geo_sort::Strategy,
limit_plus_offset: usize,
target: &[f32], target: &[f32],
) -> Result<Vec<BoxRankingRule<'ctx, PlaceholderQuery>>> { ) -> Result<Vec<BoxRankingRule<'ctx, PlaceholderQuery>>> {
// query graph search // query graph search
@@ -283,7 +284,12 @@ fn get_ranking_rules_for_vector<'ctx>(
| crate::Criterion::Exactness => { | crate::Criterion::Exactness => {
if !vector { if !vector {
let vector_candidates = ctx.index.documents_ids(ctx.txn)?; let vector_candidates = ctx.index.documents_ids(ctx.txn)?;
let vector_sort = VectorSort::new(ctx, target.to_vec(), vector_candidates)?; let vector_sort = VectorSort::new(
ctx,
target.to_vec(),
vector_candidates,
limit_plus_offset,
)?;
ranking_rules.push(Box::new(vector_sort)); ranking_rules.push(Box::new(vector_sort));
vector = true; vector = true;
} }
@@ -509,7 +515,8 @@ pub fn execute_vector_search(
/// FIXME: input universe = universe & documents_with_vectors /// FIXME: input universe = universe & documents_with_vectors
// for now if we're computing embeddings for ALL documents, we can assume that this is just universe // for now if we're computing embeddings for ALL documents, we can assume that this is just universe
let ranking_rules = get_ranking_rules_for_vector(ctx, sort_criteria, geo_strategy, vector)?; let ranking_rules =
get_ranking_rules_for_vector(ctx, sort_criteria, geo_strategy, from + length, vector)?;
let mut placeholder_search_logger = logger::DefaultSearchLogger; let mut placeholder_search_logger = logger::DefaultSearchLogger;
let placeholder_search_logger: &mut dyn SearchLogger<PlaceholderQuery> = let placeholder_search_logger: &mut dyn SearchLogger<PlaceholderQuery> =

View File

@@ -1,48 +1,83 @@
use std::future::Future;
use std::iter::FromIterator; use std::iter::FromIterator;
use std::pin::Pin;
use nolife::DynBoxScope; use ordered_float::OrderedFloat;
use roaring::RoaringBitmap; use roaring::RoaringBitmap;
use super::ranking_rules::{RankingRule, RankingRuleOutput, RankingRuleQueryTrait}; use super::ranking_rules::{RankingRule, RankingRuleOutput, RankingRuleQueryTrait};
use crate::distance::NDotProductPoint;
use crate::index::Hnsw;
use crate::score_details::{self, ScoreDetails}; use crate::score_details::{self, ScoreDetails};
use crate::{Result, SearchContext, SearchLogger, UserError}; use crate::{DocumentId, Result, SearchContext, SearchLogger};
pub struct VectorSort<'ctx, Q: RankingRuleQueryTrait> { pub struct VectorSort<Q: RankingRuleQueryTrait> {
query: Option<Q>, query: Option<Q>,
target: Vec<f32>, target: Vec<f32>,
vector_candidates: RoaringBitmap, vector_candidates: RoaringBitmap,
reader: arroy::Reader<'ctx, arroy::distances::DotProduct>, cached_sorted_docids: std::vec::IntoIter<(DocumentId, f32, Vec<f32>)>,
limit: usize, limit: usize,
} }
impl<'ctx, Q: RankingRuleQueryTrait> VectorSort<'ctx, Q> { impl<Q: RankingRuleQueryTrait> VectorSort<Q> {
pub fn new( pub fn new(
ctx: &'ctx SearchContext, _ctx: &SearchContext,
target: Vec<f32>, target: Vec<f32>,
vector_candidates: RoaringBitmap, vector_candidates: RoaringBitmap,
limit: usize, limit: usize,
) -> Result<Self> { ) -> Result<Self> {
/// FIXME? what to do in case of missing metadata Ok(Self {
let reader = arroy::Reader::open(ctx.txn, 0, ctx.index.vector_arroy)?; query: None,
target,
vector_candidates,
cached_sorted_docids: Default::default(),
limit,
})
}
let target_clone = target.clone(); fn fill_buffer(&mut self, ctx: &mut SearchContext<'_>) -> Result<()> {
let readers: std::result::Result<Vec<_>, _> = (0..=u8::MAX)
.map_while(|k| {
arroy::Reader::open(ctx.txn, k.into(), ctx.index.vector_arroy)
.map(Some)
.or_else(|e| match e {
arroy::Error::MissingMetadata => Ok(None),
e => Err(e),
})
.transpose()
})
.collect();
Ok(Self { query: None, target, vector_candidates, reader, limit }) let readers = readers?;
let target = &self.target;
let mut results = Vec::new();
for reader in readers.iter() {
let nns_by_vector = reader.nns_by_vector(
ctx.txn,
&target,
self.limit,
None,
Some(&self.vector_candidates),
)?;
let vectors: std::result::Result<Vec<_>, _> = nns_by_vector
.iter()
.map(|(docid, _)| reader.item_vector(ctx.txn, *docid).transpose().unwrap())
.collect();
let vectors = vectors?;
results.extend(nns_by_vector.into_iter().zip(vectors).map(|((x, y), z)| (x, y, z)));
}
results.sort_unstable_by_key(|(_, distance, _)| OrderedFloat(*distance));
self.cached_sorted_docids = results.into_iter();
Ok(())
} }
} }
impl<'ctx, Q: RankingRuleQueryTrait> RankingRule<'ctx, Q> for VectorSort<'ctx, Q> { impl<'ctx, Q: RankingRuleQueryTrait> RankingRule<'ctx, Q> for VectorSort<Q> {
fn id(&self) -> String { fn id(&self) -> String {
"vector_sort".to_owned() "vector_sort".to_owned()
} }
fn start_iteration( fn start_iteration(
&mut self, &mut self,
_ctx: &mut SearchContext<'ctx>, ctx: &mut SearchContext<'ctx>,
_logger: &mut dyn SearchLogger<Q>, _logger: &mut dyn SearchLogger<Q>,
universe: &RoaringBitmap, universe: &RoaringBitmap,
query: &Q, query: &Q,
@@ -51,7 +86,7 @@ impl<'ctx, Q: RankingRuleQueryTrait> RankingRule<'ctx, Q> for VectorSort<'ctx, Q
self.query = Some(query.clone()); self.query = Some(query.clone());
self.vector_candidates &= universe; self.vector_candidates &= universe;
self.fill_buffer(ctx)?;
Ok(()) Ok(())
} }
@@ -75,40 +110,24 @@ impl<'ctx, Q: RankingRuleQueryTrait> RankingRule<'ctx, Q> for VectorSort<'ctx, Q
}), }),
})); }));
} }
let target = &self.target;
let vector_candidates = &self.vector_candidates;
let result = self.reader.nns_by_vector(ctx.txn, &target, count, search_k, candidates) while let Some((docid, distance, vector)) = self.cached_sorted_docids.next() {
if self.vector_candidates.contains(docid) {
scope.enter(|it| { return Ok(Some(RankingRuleOutput {
for item in it.by_ref() { query,
let item: Item = item; candidates: RoaringBitmap::from_iter([docid]),
let index = item.pid.into_inner(); score: ScoreDetails::Vector(score_details::Vector {
let docid = ctx.index.vector_id_docid.get(ctx.txn, &index)?.unwrap(); target_vector: self.target.clone(),
value_similarity: Some((vector, 1.0 - distance)),
if vector_candidates.contains(docid) { }),
return Ok(Some(RankingRuleOutput { }));
query,
candidates: RoaringBitmap::from_iter([docid]),
score: ScoreDetails::Vector(score_details::Vector {
target_vector: target.clone(),
value_similarity: Some((
item.point.clone().into_inner(),
1.0 - item.distance,
)),
}),
}));
}
} }
Ok(Some(RankingRuleOutput { }
query,
candidates: universe.clone(), // if we got out of this loop it means we've exhausted our cache.
score: ScoreDetails::Vector(score_details::Vector { // we need to refill it and run the function again.
target_vector: target.clone(), self.fill_buffer(ctx)?;
value_similarity: None, self.next_bucket(ctx, _logger, universe)
}),
}))
})
} }
fn end_iteration(&mut self, _ctx: &mut SearchContext<'ctx>, _logger: &mut dyn SearchLogger<Q>) { fn end_iteration(&mut self, _ctx: &mut SearchContext<'ctx>, _logger: &mut dyn SearchLogger<Q>) {

View File

@@ -42,9 +42,7 @@ impl<'t, 'i> ClearDocuments<'t, 'i> {
facet_id_is_empty_docids, facet_id_is_empty_docids,
field_id_docid_facet_f64s, field_id_docid_facet_f64s,
field_id_docid_facet_strings, field_id_docid_facet_strings,
vector_id_docid,
vector_arroy, vector_arroy,
docid_vector_ids,
embedder_category_id: _, embedder_category_id: _,
documents, documents,
} = self.index; } = self.index;
@@ -86,8 +84,6 @@ impl<'t, 'i> ClearDocuments<'t, 'i> {
field_id_docid_facet_strings.clear(self.wtxn)?; field_id_docid_facet_strings.clear(self.wtxn)?;
// vector // vector
vector_arroy.clear(self.wtxn)?; vector_arroy.clear(self.wtxn)?;
vector_id_docid.clear(self.wtxn)?;
docid_vector_ids.clear(self.wtxn)?;
documents.clear(self.wtxn)?; documents.clear(self.wtxn)?;

View File

@@ -418,7 +418,7 @@ where
} }
// needs to be dropped to avoid channel waiting lock. // needs to be dropped to avoid channel waiting lock.
drop(lmdb_writer_sx) drop(lmdb_writer_sx);
}); });
let index_is_empty = self.index.number_of_documents(self.wtxn)? == 0; let index_is_empty = self.index.number_of_documents(self.wtxn)? == 0;
@@ -435,6 +435,8 @@ where
let mut word_docids = None; let mut word_docids = None;
let mut exact_word_docids = None; let mut exact_word_docids = None;
let mut dimension = None;
for result in lmdb_writer_rx { for result in lmdb_writer_rx {
if (self.should_abort)() { if (self.should_abort)() {
return Err(Error::InternalError(InternalError::AbortedIndexation)); return Err(Error::InternalError(InternalError::AbortedIndexation));
@@ -464,6 +466,20 @@ where
word_position_docids = Some(cloneable_chunk); word_position_docids = Some(cloneable_chunk);
TypedChunk::WordPositionDocids(chunk) TypedChunk::WordPositionDocids(chunk)
} }
TypedChunk::VectorPoints {
expected_dimension,
remove_vectors,
embeddings,
manual_vectors,
} => {
dimension = Some(expected_dimension);
TypedChunk::VectorPoints {
remove_vectors,
embeddings,
expected_dimension,
manual_vectors,
}
}
otherwise => otherwise, otherwise => otherwise,
}; };
@@ -490,9 +506,6 @@ where
} }
} }
let writer = arroy::Writer::prepare(self.wtxn, self.index.vector_arroy, 0, 0)?;
writer.build(self.wtxn, &mut rand::rngs::StdRng::from_entropy(), None)?;
// We write the field distribution into the main database // We write the field distribution into the main database
self.index.put_field_distribution(self.wtxn, &field_distribution)?; self.index.put_field_distribution(self.wtxn, &field_distribution)?;
@@ -500,6 +513,23 @@ where
self.index.put_primary_key(self.wtxn, &primary_key)?; self.index.put_primary_key(self.wtxn, &primary_key)?;
let number_of_documents = self.index.number_of_documents(self.wtxn)?; let number_of_documents = self.index.number_of_documents(self.wtxn)?;
if let Some(dimension) = dimension {
let wtxn = &mut *self.wtxn;
let vector_arroy = self.index.vector_arroy;
pool.install(|| {
/// FIXME: do for each embedder
let mut rng = rand::rngs::StdRng::from_entropy();
for k in 0..=u8::MAX {
let writer = arroy::Writer::prepare(wtxn, vector_arroy, k.into(), dimension)?;
if writer.is_empty(wtxn)? {
break;
}
writer.build(wtxn, &mut rng, None)?;
}
Result::Ok(())
})?;
}
self.execute_prefix_databases( self.execute_prefix_databases(
word_docids, word_docids,
exact_word_docids, exact_word_docids,

View File

@@ -8,9 +8,7 @@ use charabia::{Language, Script};
use grenad::MergerBuilder; use grenad::MergerBuilder;
use heed::types::Bytes; use heed::types::Bytes;
use heed::{PutFlags, RwTxn}; use heed::{PutFlags, RwTxn};
use log::error;
use obkv::{KvReader, KvWriter}; use obkv::{KvReader, KvWriter};
use ordered_float::OrderedFloat;
use roaring::RoaringBitmap; use roaring::RoaringBitmap;
use super::helpers::{ use super::helpers::{
@@ -18,16 +16,12 @@ use super::helpers::{
valid_lmdb_key, CursorClonableMmap, valid_lmdb_key, CursorClonableMmap,
}; };
use super::{ClonableMmap, MergeFn}; use super::{ClonableMmap, MergeFn};
use crate::distance::NDotProductPoint;
use crate::error::UserError;
use crate::external_documents_ids::{DocumentOperation, DocumentOperationKind}; use crate::external_documents_ids::{DocumentOperation, DocumentOperationKind};
use crate::facet::FacetType; use crate::facet::FacetType;
use crate::index::db_name::DOCUMENTS; use crate::index::db_name::DOCUMENTS;
use crate::index::Hnsw;
use crate::update::del_add::{deladd_serialize_add_side, DelAdd, KvReaderDelAdd}; use crate::update::del_add::{deladd_serialize_add_side, DelAdd, KvReaderDelAdd};
use crate::update::facet::FacetsUpdate; use crate::update::facet::FacetsUpdate;
use crate::update::index_documents::helpers::{as_cloneable_grenad, try_split_array_at}; use crate::update::index_documents::helpers::{as_cloneable_grenad, try_split_array_at};
use crate::update::{available_documents_ids, AvailableDocumentsIds};
use crate::{lat_lng_to_xyz, DocumentId, FieldId, GeoPoint, Index, Result, SerializationError}; use crate::{lat_lng_to_xyz, DocumentId, FieldId, GeoPoint, Index, Result, SerializationError};
pub(crate) enum TypedChunk { pub(crate) enum TypedChunk {
@@ -374,28 +368,28 @@ pub(crate) fn write_typed_chunk_into_index(
return Ok((RoaringBitmap::new(), is_merged_database)); return Ok((RoaringBitmap::new(), is_merged_database));
} }
let mut unavailable_vector_ids = index.unavailable_vector_ids(&wtxn)?;
/// FIXME: allow customizing distance /// FIXME: allow customizing distance
/// FIXME: allow customizing index let writers: std::result::Result<Vec<_>, _> = (0..=u8::MAX)
let writer = arroy::Writer::prepare(wtxn, index.vector_arroy, 0, expected_dimension)?; .map(|k| {
/// FIXME: allow customizing index and then do index << 8 + k
arroy::Writer::prepare(wtxn, index.vector_arroy, k.into(), expected_dimension)
})
.collect();
let writers = writers?;
// remove vectors for docids we want them removed // remove vectors for docids we want them removed
let mut cursor = remove_vectors.into_cursor()?; let mut cursor = remove_vectors.into_cursor()?;
while let Some((key, _)) = cursor.move_on_next()? { while let Some((key, _)) = cursor.move_on_next()? {
let docid = key.try_into().map(DocumentId::from_be_bytes).unwrap(); let docid = key.try_into().map(DocumentId::from_be_bytes).unwrap();
let Some(to_remove_vector_ids) = index.docid_vector_ids.get(&wtxn, &docid)? else { for writer in &writers {
continue; // Uses invariant: vectors are packed in the first writers.
}; if !writer.del_item(wtxn, docid)? {
unavailable_vector_ids -= to_remove_vector_ids; break;
}
for item in to_remove_vector_ids {
writer.del_item(wtxn, item)?;
} }
} }
let mut available_vector_ids =
AvailableDocumentsIds::from_documents_ids(&unavailable_vector_ids);
// add generated embeddings // add generated embeddings
if let Some(embeddings) = embeddings { if let Some(embeddings) = embeddings {
let mut cursor = embeddings.into_cursor()?; let mut cursor = embeddings.into_cursor()?;
@@ -408,19 +402,10 @@ pub(crate) fn write_typed_chunk_into_index(
// code error if we somehow got the wrong dimension // code error if we somehow got the wrong dimension
.unwrap(); .unwrap();
let mut new_vector_ids = RoaringBitmap::new(); /// FIXME: detect overflow
for embedding in embeddings.iter() { for (embedding, writer) in embeddings.iter().zip(&writers) {
/// FIXME: error when you get over 9000 writer.add_item(wtxn, docid, embedding)?;
let next_vector_id = available_vector_ids.next().unwrap();
unavailable_vector_ids.insert(next_vector_id);
new_vector_ids.insert(next_vector_id);
index.vector_id_docid.put(wtxn, &next_vector_id, &docid)?;
writer.add_item(wtxn, next_vector_id, embedding)?;
} }
index.docid_vector_ids.put(wtxn, &docid, &new_vector_ids)?;
} }
} }
@@ -433,44 +418,52 @@ pub(crate) fn write_typed_chunk_into_index(
let vector_deladd_obkv = KvReaderDelAdd::new(value); let vector_deladd_obkv = KvReaderDelAdd::new(value);
if let Some(value) = vector_deladd_obkv.get(DelAdd::Deletion) { if let Some(value) = vector_deladd_obkv.get(DelAdd::Deletion) {
let vector = pod_collect_to_vec(value); let vector: Vec<f32> = pod_collect_to_vec(value);
let Some(mut docid_vector_ids) = index.docid_vector_ids.get(&wtxn, &docid)?
else { let mut deleted_index = None;
error!("Unable to delete the vector: {:?}", vector); for (index, writer) in writers.iter().enumerate() {
continue; let Some(candidate) = writer.item_vector(&wtxn, docid)? else {
}; // uses invariant: vectors are packed in the first writers.
for item in docid_vector_ids {
/// FIXME: comparing the vectors by equality is inefficient, and dangerous by perfect equality
let candidate = writer.item_vector(&wtxn, item)?.expect("Inconsistent dbs");
if candidate == vector {
writer.del_item(wtxn, item)?;
unavailable_vector_ids.remove(item);
index.vector_id_docid.delete(wtxn, &item)?;
docid_vector_ids.remove(item);
break; break;
};
if candidate == vector {
writer.del_item(wtxn, docid)?;
deleted_index = Some(index);
}
}
// 🥲 enforce invariant: vectors are packed in the first writers.
if let Some(deleted_index) = deleted_index {
let mut last_index_with_a_vector = None;
for (index, writer) in writers.iter().enumerate().skip(deleted_index) {
let Some(candidate) = writer.item_vector(&wtxn, docid)? else {
break;
};
last_index_with_a_vector = Some((index, candidate));
}
if let Some((last_index, vector)) = last_index_with_a_vector {
// unwrap: computed the index from the list of writers
let writer = writers.get(last_index).unwrap();
writer.del_item(wtxn, docid)?;
writers.get(deleted_index).unwrap().add_item(wtxn, docid, &vector)?;
} }
} }
index.docid_vector_ids.put(wtxn, &docid, &docid_vector_ids)?;
} }
let mut available_vector_ids =
AvailableDocumentsIds::from_documents_ids(&unavailable_vector_ids);
if let Some(value) = vector_deladd_obkv.get(DelAdd::Addition) { if let Some(value) = vector_deladd_obkv.get(DelAdd::Addition) {
let vector = pod_collect_to_vec(value); let vector = pod_collect_to_vec(value);
let next_vector_id = available_vector_ids.next().unwrap();
writer.add_item(wtxn, next_vector_id, &vector)?; /// FIXME: detect overflow
unavailable_vector_ids.insert(next_vector_id); for writer in &writers {
index.vector_id_docid.put(wtxn, &next_vector_id, &docid)?; if !writer.contains_item(wtxn, docid)? {
let mut docid_vector_ids = writer.add_item(wtxn, docid, &vector)?;
index.docid_vector_ids.get(&wtxn, &docid)?.unwrap_or_default(); break;
docid_vector_ids.insert(next_vector_id); }
index.docid_vector_ids.put(wtxn, &docid, &docid_vector_ids)?; }
} }
} }
log::debug!("There are {} entries in the arroy so far", unavailable_vector_ids.len()); log::debug!("There are 🤷‍♀️ entries in the arroy so far");
index.put_unavailable_vector_ids(wtxn, unavailable_vector_ids)?;
} }
TypedChunk::ScriptLanguageDocids(sl_map) => { TypedChunk::ScriptLanguageDocids(sl_map) => {
for (key, (deletion, addition)) in sl_map { for (key, (deletion, addition)) in sl_map {