Store the vectors in an HNSW in LMDB

Clément Renault
2023-06-08 12:19:06 +02:00
parent 7ac2f1489d
commit 4571e512d2
8 changed files with 142 additions and 36 deletions

milli/src/dot_product.rs (Normal file, 16 lines)
View File

@@ -0,0 +1,16 @@
use serde::{Deserialize, Serialize};
use space::Metric;
#[derive(Debug, Default, Clone, Copy, Serialize, Deserialize)]
pub struct DotProduct;
impl Metric<Vec<f32>> for DotProduct {
type Unit = u32;
// Following <https://docs.rs/space/0.17.0/space/trait.Metric.html>.
fn distance(&self, a: &Vec<f32>, b: &Vec<f32>) -> Self::Unit {
let dist: f32 = a.iter().zip(b).map(|(a, b)| a * b).sum();
debug_assert!(!dist.is_nan());
dist.to_bits()
}
}
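For context, here is a minimal usage sketch of this metric. It is not part of the commit; it assumes the `DotProduct` type above is in scope and that the `space` crate is a dependency, and the `main` function and sample vectors are purely illustrative. The `space::Metric` trait is what the `hnsw` crate calls to compare points, and `to_bits` maps a non-negative `f32` to a `u32` that preserves its ordering.

use space::Metric;

fn main() {
    let metric = DotProduct;
    let a = vec![0.0_f32, 1.0, 0.5];
    let b = vec![1.0_f32, 0.5, 0.5];
    // 0.0 * 1.0 + 1.0 * 0.5 + 0.5 * 0.5 = 0.75, returned as its IEEE-754 bit pattern.
    let distance = metric.distance(&a, &b);
    assert_eq!(distance, 0.75_f32.to_bits());
}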

View File

@@ -8,10 +8,12 @@ use charabia::{Language, Script};
use heed::flags::Flags;
use heed::types::*;
use heed::{CompactionOption, Database, PolyDatabase, RoTxn, RwTxn};
use rand_pcg::Pcg32;
use roaring::RoaringBitmap;
use rstar::RTree;
use time::OffsetDateTime;
use crate::dot_product::DotProduct;
use crate::error::{InternalError, UserError};
use crate::facet::FacetType;
use crate::fields_ids_map::FieldsIdsMap;
@@ -26,6 +28,9 @@ use crate::{
Result, RoaringBitmapCodec, RoaringBitmapLenCodec, Search, U8StrStrCodec, BEU16, BEU32,
};
/// The HNSW data structure that we serialize, fill and search in.
pub type Hnsw = hnsw::Hnsw<DotProduct, Vec<f32>, Pcg32, 12, 24>;
pub const DEFAULT_MIN_WORD_LEN_ONE_TYPO: u8 = 5;
pub const DEFAULT_MIN_WORD_LEN_TWO_TYPOS: u8 = 9;
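As a side note on the `Hnsw` alias above: its generic arguments are, in order, the metric, the point type, the random number generator used to pick layer levels during insertion, and two connectivity constants (12 and 24 here), which by my reading of the `hnsw` crate bound the number of links per node on the upper layers and on layer zero respectively. A hedged sketch of starting from an empty graph and inserting one point, mirroring what the indexing code later in this diff does:

use hnsw::Searcher;

let mut hnsw = Hnsw::default();        // an empty graph, as `unwrap_or_default()` yields
let mut searcher = Searcher::new();    // scratch state reused across insertions
let vector_id = hnsw.insert(vec![0.0_f32, 1.0, 0.5], &mut searcher);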
@@ -42,6 +47,7 @@ pub mod main_key {
pub const FIELDS_IDS_MAP_KEY: &str = "fields-ids-map";
pub const GEO_FACETED_DOCUMENTS_IDS_KEY: &str = "geo-faceted-documents-ids";
pub const GEO_RTREE_KEY: &str = "geo-rtree";
pub const VECTOR_HNSW_KEY: &str = "vector-hnsw";
pub const HARD_EXTERNAL_DOCUMENTS_IDS_KEY: &str = "hard-external-documents-ids";
pub const NUMBER_FACETED_DOCUMENTS_IDS_PREFIX: &str = "number-faceted-documents-ids";
pub const PRIMARY_KEY_KEY: &str = "primary-key";
@@ -86,6 +92,7 @@ pub mod db_name {
pub const FACET_ID_STRING_DOCIDS: &str = "facet-id-string-docids";
pub const FIELD_ID_DOCID_FACET_F64S: &str = "field-id-docid-facet-f64s";
pub const FIELD_ID_DOCID_FACET_STRINGS: &str = "field-id-docid-facet-strings";
pub const VECTOR_ID_DOCID: &str = "vector-id-docids";
pub const DOCUMENTS: &str = "documents";
pub const SCRIPT_LANGUAGE_DOCIDS: &str = "script_language_docids";
}
@@ -149,6 +156,9 @@ pub struct Index {
/// Maps the document id, the facet field id and the strings.
pub field_id_docid_facet_strings: Database<FieldDocIdFacetStringCodec, Str>,
/// Maps a vector id to the document id that has it.
pub vector_id_docid: Database<OwnedType<BEU32>, OwnedType<BEU32>>,
/// Maps the document id to the document as an obkv store.
pub(crate) documents: Database<OwnedType<BEU32>, ObkvCodec>,
}
@@ -162,7 +172,7 @@ impl Index {
) -> Result<Index> {
use db_name::*;
options.max_dbs(23);
options.max_dbs(24);
unsafe { options.flag(Flags::MdbAlwaysFreePages) };
let env = options.open(path)?;
@@ -198,11 +208,11 @@ impl Index {
env.create_database(&mut wtxn, Some(FACET_ID_IS_NULL_DOCIDS))?;
let facet_id_is_empty_docids =
env.create_database(&mut wtxn, Some(FACET_ID_IS_EMPTY_DOCIDS))?;
let field_id_docid_facet_f64s =
env.create_database(&mut wtxn, Some(FIELD_ID_DOCID_FACET_F64S))?;
let field_id_docid_facet_strings =
env.create_database(&mut wtxn, Some(FIELD_ID_DOCID_FACET_STRINGS))?;
let vector_id_docid = env.create_database(&mut wtxn, Some(VECTOR_ID_DOCID))?;
let documents = env.create_database(&mut wtxn, Some(DOCUMENTS))?;
wtxn.commit()?;
@@ -231,6 +241,7 @@ impl Index {
facet_id_is_empty_docids,
field_id_docid_facet_f64s,
field_id_docid_facet_strings,
vector_id_docid,
documents,
})
}
@@ -502,6 +513,26 @@ impl Index {
}
}
/* vector HNSW */
/// Writes the provided `hnsw`.
pub(crate) fn put_vector_hnsw(&self, wtxn: &mut RwTxn, hnsw: &Hnsw) -> heed::Result<()> {
self.main.put::<_, Str, SerdeBincode<Hnsw>>(wtxn, main_key::VECTOR_HNSW_KEY, hnsw)
}
/// Deletes the `hnsw`.
pub(crate) fn delete_vector_hnsw(&self, wtxn: &mut RwTxn) -> heed::Result<bool> {
self.main.delete::<_, Str>(wtxn, main_key::VECTOR_HNSW_KEY)
}
/// Returns the `hnsw`.
pub fn vector_hnsw(&self, rtxn: &RoTxn) -> Result<Option<Hnsw>> {
match self.main.get::<_, Str, SerdeBincode<Hnsw>>(rtxn, main_key::VECTOR_HNSW_KEY)? {
Some(hnsw) => Ok(Some(hnsw)),
None => Ok(None),
}
}
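To show how these accessors and the new `vector_id_docid` database fit together on the read side, here is a hedged sketch. It is not part of the commit; it assumes an open `index` and a read transaction `rtxn`, sits inside a function returning the crate's `Result` so `?` can propagate errors, and `vector_id` is a hypothetical id of the kind returned by `hnsw.insert` during indexing.

// Load the serialized HNSW from the main database, or start from an empty one.
// At search time this graph is what would be queried for nearest neighbours.
let hnsw = index.vector_hnsw(&rtxn)?.unwrap_or_default();

// Resolve an internal vector id back to the document that owns the vector.
let vector_id = 0u32; // hypothetical
if let Some(docid) = index.vector_id_docid.get(&rtxn, &BEU32::new(vector_id))? {
    println!("vector {} belongs to document {}", vector_id, docid.get());
}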
/* field distribution */
/// Writes the field distribution which associates every field name with
@@ -1466,9 +1497,9 @@ pub(crate) mod tests {
db_snap!(index, field_distribution,
@r###"
age 1 |
id 2 |
name 2 |
age 1
id 2
name 2
"###
);
@@ -1486,9 +1517,9 @@ pub(crate) mod tests {
db_snap!(index, field_distribution,
@r###"
age 1 |
id 2 |
name 2 |
age 1
id 2
name 2
"###
);
@@ -1502,9 +1533,9 @@ pub(crate) mod tests {
db_snap!(index, field_distribution,
@r###"
has_dog 1 |
id 2 |
name 2 |
has_dog 1
id 2
name 2
"###
);
}

View File

@@ -10,6 +10,7 @@ pub mod documents;
mod asc_desc;
mod criterion;
pub mod dot_product;
mod error;
mod external_documents_ids;
pub mod facet;

View File

@@ -39,6 +39,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
facet_id_is_empty_docids,
field_id_docid_facet_f64s,
field_id_docid_facet_strings,
vector_id_docid,
documents,
} = self.index;
@@ -57,6 +58,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
self.index.put_field_distribution(self.wtxn, &FieldDistribution::default())?;
self.index.delete_geo_rtree(self.wtxn)?;
self.index.delete_geo_faceted_documents_ids(self.wtxn)?;
self.index.delete_vector_hnsw(self.wtxn)?;
// We clean all the faceted documents ids.
for field_id in faceted_fields {
@@ -95,6 +97,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
facet_id_string_docids.clear(self.wtxn)?;
field_id_docid_facet_f64s.clear(self.wtxn)?;
field_id_docid_facet_strings.clear(self.wtxn)?;
vector_id_docid.clear(self.wtxn)?;
documents.clear(self.wtxn)?;
Ok(number_of_documents)

View File

@@ -240,6 +240,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
facet_id_exists_docids,
facet_id_is_null_docids,
facet_id_is_empty_docids,
vector_id_docid,
documents,
} = self.index;
// Remove from the documents database

View File

@@ -4,10 +4,12 @@ use std::convert::TryInto;
use std::fs::File;
use std::io;
use bytemuck::allocation::pod_collect_to_vec;
use charabia::{Language, Script};
use grenad::MergerBuilder;
use heed::types::ByteSlice;
use heed::RwTxn;
use hnsw::Searcher;
use roaring::RoaringBitmap;
use super::helpers::{
@@ -17,7 +19,7 @@ use super::{ClonableMmap, MergeFn};
use crate::facet::FacetType;
use crate::update::facet::FacetsUpdate;
use crate::update::index_documents::helpers::as_cloneable_grenad;
use crate::{lat_lng_to_xyz, CboRoaringBitmapCodec, DocumentId, GeoPoint, Index, Result};
use crate::{lat_lng_to_xyz, CboRoaringBitmapCodec, DocumentId, GeoPoint, Index, Result, BEU32};
pub(crate) enum TypedChunk {
FieldIdDocidFacetStrings(grenad::Reader<CursorClonableMmap>),
@@ -223,27 +225,19 @@ pub(crate) fn write_typed_chunk_into_index(
index.put_geo_faceted_documents_ids(wtxn, &geo_faceted_docids)?;
}
TypedChunk::VectorPoints(vector_points) => {
// let mut rtree = index.geo_rtree(wtxn)?.unwrap_or_default();
// let mut geo_faceted_docids = index.geo_faceted_documents_ids(wtxn)?;
let mut hnsw = index.vector_hnsw(wtxn)?.unwrap_or_default();
let mut searcher = Searcher::new();
// let mut cursor = geo_points.into_cursor()?;
// while let Some((key, value)) = cursor.move_on_next()? {
// // convert the key back to a u32 (4 bytes)
// let docid = key.try_into().map(DocumentId::from_be_bytes).unwrap();
// // convert the latitude and longitude back to a f64 (8 bytes)
// let (lat, tail) = helpers::try_split_array_at::<u8, 8>(value).unwrap();
// let (lng, _) = helpers::try_split_array_at::<u8, 8>(tail).unwrap();
// let point = [f64::from_ne_bytes(lat), f64::from_ne_bytes(lng)];
// let xyz_point = lat_lng_to_xyz(&point);
// rtree.insert(GeoPoint::new(xyz_point, (docid, point)));
// geo_faceted_docids.insert(docid);
// }
// index.put_geo_rtree(wtxn, &rtree)?;
// index.put_geo_faceted_documents_ids(wtxn, &geo_faceted_docids)?;
todo!("index vector points")
let mut cursor = vector_points.into_cursor()?;
while let Some((key, value)) = cursor.move_on_next()? {
// convert the key back to a u32 (4 bytes)
let docid = key.try_into().map(DocumentId::from_be_bytes).unwrap();
// convert the vector back to a Vec<f32>
let vector: Vec<f32> = pod_collect_to_vec(value);
let vector_id = hnsw.insert(vector, &mut searcher) as u32;
index.vector_id_docid.put(wtxn, &BEU32::new(vector_id), &BEU32::new(docid))?;
}
index.put_vector_hnsw(wtxn, &hnsw)?;
}
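For reference, a hedged sketch of the shape of one `VectorPoints` entry that the loop above decodes; the writer side lives elsewhere in the extraction pipeline and is not shown in this diff, so the names below are purely illustrative. The key is the document id as big-endian bytes and the value is the vector's raw `f32` bytes, which is why `pod_collect_to_vec` can rebuild the `Vec<f32>`.

let docid: u32 = 42;
let vector = vec![0.1_f32, 0.2, 0.3];
let key: [u8; 4] = docid.to_be_bytes();                                 // read back with DocumentId::from_be_bytes
let value: Vec<u8> = bytemuck::cast_slice::<f32, u8>(&vector).to_vec(); // read back with pod_collect_to_vec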
TypedChunk::ScriptLanguageDocids(hash_pair) => {
let mut buffer = Vec::new();