Extract the vectors from the documents

2025-10-26 05:26:27 +00:00 · 2023-06-08 11:51:55 +02:00
parent 34349faeae
commit 7ac2f1489d
3 changed files with 44 additions and 0 deletions
--- a/milli/src/update/index_documents/extract/mod.rs
+++ b/milli/src/update/index_documents/extract/mod.rs
@@ -47,6 +47,7 @@ pub(crate) fn data_from_obkv_documents(
    faceted_fields: HashSet<FieldId>,
    primary_key_id: FieldId,
    geo_fields_ids: Option<(FieldId, FieldId)>,
    vector_field_id: Option<FieldId>,
    stop_words: Option<fst::Set<&[u8]>>,
    max_positions_per_attributes: Option<u32>,
    exact_attributes: HashSet<FieldId>,
@@ -71,6 +72,7 @@ pub(crate) fn data_from_obkv_documents(
                    &faceted_fields,
                    primary_key_id,
                    geo_fields_ids,
                    vector_field_id,
                    &stop_words,
                    max_positions_per_attributes,
                )
@@ -281,6 +283,7 @@ fn send_and_extract_flattened_documents_data(
    faceted_fields: &HashSet<FieldId>,
    primary_key_id: FieldId,
    geo_fields_ids: Option<(FieldId, FieldId)>,
    vector_field_id: Option<FieldId>,
    stop_words: &Option<fst::Set<&[u8]>>,
    max_positions_per_attributes: Option<u32>,
 ) -> Result<(
@@ -309,6 +312,20 @@ fn send_and_extract_flattened_documents_data(
        });
    }
    // Vector extraction is only scheduled when a vector field id was provided.
    if let Some(vector_field_id) = vector_field_id {
        let chunk_for_vectors = flattened_documents_chunk.clone();
        let vectors_sender = lmdb_writer_sx.clone();
        rayon::spawn(move || {
            // A successful extraction is wrapped into its typed chunk;
            // an extraction error is forwarded to the writer unchanged.
            let message = extract_vector_points(chunk_for_vectors, indexer, vector_field_id)
                .map(TypedChunk::VectorPoints);
            // The result of `send` is intentionally discarded.
            let _ = vectors_sender.send(message);
        });
    }
    let (docid_word_positions_chunk, docid_fid_facet_values_chunks): (Result<_>, Result<_>) =
        rayon::join(
            || {
--- a/milli/src/update/index_documents/mod.rs
+++ b/milli/src/update/index_documents/mod.rs
@@ -304,6 +304,8 @@ where
            }
            None => None,
        };
        // get the fid of the `_vector` field.
        let vector_field_id = self.index.fields_ids_map(self.wtxn)?.id("_vector");
        let stop_words = self.index.stop_words(self.wtxn)?;
        let exact_attributes = self.index.exact_attributes_ids(self.wtxn)?;
@@ -340,6 +342,7 @@ where
                    faceted_fields,
                    primary_key_id,
                    geo_fields_ids,
                    vector_field_id,
                    stop_words,
                    max_positions_per_attributes,
                    exact_attributes,
--- a/milli/src/update/index_documents/typed_chunk.rs
+++ b/milli/src/update/index_documents/typed_chunk.rs
@@ -38,6 +38,7 @@ pub(crate) enum TypedChunk {
    FieldIdFacetIsNullDocids(grenad::Reader<File>),
    FieldIdFacetIsEmptyDocids(grenad::Reader<File>),
    GeoPoints(grenad::Reader<File>),
    VectorPoints(grenad::Reader<File>),
    ScriptLanguageDocids(HashMap<(Script, Language), RoaringBitmap>),
 }
@@ -221,6 +222,29 @@ pub(crate) fn write_typed_chunk_into_index(
            index.put_geo_rtree(wtxn, &rtree)?;
            index.put_geo_faceted_documents_ids(wtxn, &geo_faceted_docids)?;
        }
        TypedChunk::VectorPoints(vector_points) => {
            // NOTE: writing vector points into the index is not implemented yet.
            // Reaching this arm panics through the `todo!` below, and the
            // `vector_points` reader is currently left unused.
            //
            // NOTE(review): the commented-out code below still refers to `geo_points`
            // and the geo rtree, so it looks like a copy of the geo-points handling
            // kept as a template for the future implementation — confirm before reuse.
            //
            // let mut rtree = index.geo_rtree(wtxn)?.unwrap_or_default();
            // let mut geo_faceted_docids = index.geo_faceted_documents_ids(wtxn)?;
            // let mut cursor = geo_points.into_cursor()?;
            // while let Some((key, value)) = cursor.move_on_next()? {
            //     // convert the key back to a u32 (4 bytes)
            //     let docid = key.try_into().map(DocumentId::from_be_bytes).unwrap();
            //     // convert the latitude and longitude back to a f64 (8 bytes)
            //     let (lat, tail) = helpers::try_split_array_at::<u8, 8>(value).unwrap();
            //     let (lng, _) = helpers::try_split_array_at::<u8, 8>(tail).unwrap();
            //     let point = [f64::from_ne_bytes(lat), f64::from_ne_bytes(lng)];
            //     let xyz_point = lat_lng_to_xyz(&point);
            //     rtree.insert(GeoPoint::new(xyz_point, (docid, point)));
            //     geo_faceted_docids.insert(docid);
            // }
            // index.put_geo_rtree(wtxn, &rtree)?;
            // index.put_geo_faceted_documents_ids(wtxn, &geo_faceted_docids)?;
            todo!("index vector points")
        }
        TypedChunk::ScriptLanguageDocids(hash_pair) => {
            let mut buffer = Vec::new();
            for (key, value) in hash_pair {