Mirror of https://github.com/meilisearch/meilisearch.git (synced 2025-10-26 05:26:27 +00:00)
			
		
		
		
	Extract the vectors from the documents
This commit is contained in:
		| @@ -47,6 +47,7 @@ pub(crate) fn data_from_obkv_documents( | ||||
|     faceted_fields: HashSet<FieldId>, | ||||
|     primary_key_id: FieldId, | ||||
|     geo_fields_ids: Option<(FieldId, FieldId)>, | ||||
|     vector_field_id: Option<FieldId>, | ||||
|     stop_words: Option<fst::Set<&[u8]>>, | ||||
|     max_positions_per_attributes: Option<u32>, | ||||
|     exact_attributes: HashSet<FieldId>, | ||||
| @@ -71,6 +72,7 @@ pub(crate) fn data_from_obkv_documents( | ||||
|                     &faceted_fields, | ||||
|                     primary_key_id, | ||||
|                     geo_fields_ids, | ||||
|                     vector_field_id, | ||||
|                     &stop_words, | ||||
|                     max_positions_per_attributes, | ||||
|                 ) | ||||
| @@ -281,6 +283,7 @@ fn send_and_extract_flattened_documents_data( | ||||
|     faceted_fields: &HashSet<FieldId>, | ||||
|     primary_key_id: FieldId, | ||||
|     geo_fields_ids: Option<(FieldId, FieldId)>, | ||||
|     vector_field_id: Option<FieldId>, | ||||
|     stop_words: &Option<fst::Set<&[u8]>>, | ||||
|     max_positions_per_attributes: Option<u32>, | ||||
| ) -> Result<( | ||||
| @@ -309,6 +312,20 @@ fn send_and_extract_flattened_documents_data( | ||||
|         }); | ||||
|     } | ||||
|  | ||||
|     if let Some(vector_field_id) = vector_field_id { | ||||
|         let documents_chunk_cloned = flattened_documents_chunk.clone(); | ||||
|         let lmdb_writer_sx_cloned = lmdb_writer_sx.clone(); | ||||
|         rayon::spawn(move || { | ||||
|             let result = extract_vector_points(documents_chunk_cloned, indexer, vector_field_id); | ||||
|             let _ = match result { | ||||
|                 Ok(vector_points) => { | ||||
|                     lmdb_writer_sx_cloned.send(Ok(TypedChunk::VectorPoints(vector_points))) | ||||
|                 } | ||||
|                 Err(error) => lmdb_writer_sx_cloned.send(Err(error)), | ||||
|             }; | ||||
|         }); | ||||
|     } | ||||
|  | ||||
|     let (docid_word_positions_chunk, docid_fid_facet_values_chunks): (Result<_>, Result<_>) = | ||||
|         rayon::join( | ||||
|             || { | ||||
|   | ||||
| @@ -304,6 +304,8 @@ where | ||||
|             } | ||||
|             None => None, | ||||
|         }; | ||||
|         // get the fid of the `_vector` field. | ||||
|         let vector_field_id = self.index.fields_ids_map(self.wtxn)?.id("_vector"); | ||||
|  | ||||
|         let stop_words = self.index.stop_words(self.wtxn)?; | ||||
|         let exact_attributes = self.index.exact_attributes_ids(self.wtxn)?; | ||||
| @@ -340,6 +342,7 @@ where | ||||
|                     faceted_fields, | ||||
|                     primary_key_id, | ||||
|                     geo_fields_ids, | ||||
|                     vector_field_id, | ||||
|                     stop_words, | ||||
|                     max_positions_per_attributes, | ||||
|                     exact_attributes, | ||||
|   | ||||
| @@ -38,6 +38,7 @@ pub(crate) enum TypedChunk { | ||||
|     FieldIdFacetIsNullDocids(grenad::Reader<File>), | ||||
|     FieldIdFacetIsEmptyDocids(grenad::Reader<File>), | ||||
|     GeoPoints(grenad::Reader<File>), | ||||
|     VectorPoints(grenad::Reader<File>), | ||||
|     ScriptLanguageDocids(HashMap<(Script, Language), RoaringBitmap>), | ||||
| } | ||||
|  | ||||
| @@ -221,6 +222,29 @@ pub(crate) fn write_typed_chunk_into_index( | ||||
|             index.put_geo_rtree(wtxn, &rtree)?; | ||||
|             index.put_geo_faceted_documents_ids(wtxn, &geo_faceted_docids)?; | ||||
|         } | ||||
|         TypedChunk::VectorPoints(vector_points) => { | ||||
|             // let mut rtree = index.geo_rtree(wtxn)?.unwrap_or_default(); | ||||
|             // let mut geo_faceted_docids = index.geo_faceted_documents_ids(wtxn)?; | ||||
|  | ||||
|             // let mut cursor = geo_points.into_cursor()?; | ||||
|             // while let Some((key, value)) = cursor.move_on_next()? { | ||||
|             //     // convert the key back to a u32 (4 bytes) | ||||
|             //     let docid = key.try_into().map(DocumentId::from_be_bytes).unwrap(); | ||||
|  | ||||
|             //     // convert the latitude and longitude back to a f64 (8 bytes) | ||||
|             //     let (lat, tail) = helpers::try_split_array_at::<u8, 8>(value).unwrap(); | ||||
|             //     let (lng, _) = helpers::try_split_array_at::<u8, 8>(tail).unwrap(); | ||||
|             //     let point = [f64::from_ne_bytes(lat), f64::from_ne_bytes(lng)]; | ||||
|             //     let xyz_point = lat_lng_to_xyz(&point); | ||||
|  | ||||
|             //     rtree.insert(GeoPoint::new(xyz_point, (docid, point))); | ||||
|             //     geo_faceted_docids.insert(docid); | ||||
|             // } | ||||
|             // index.put_geo_rtree(wtxn, &rtree)?; | ||||
|             // index.put_geo_faceted_documents_ids(wtxn, &geo_faceted_docids)?; | ||||
|  | ||||
|             todo!("index vector points") | ||||
|         } | ||||
|         TypedChunk::ScriptLanguageDocids(hash_pair) => { | ||||
|             let mut buffer = Vec::new(); | ||||
|             for (key, value) in hash_pair { | ||||
|   | ||||
		Reference in New Issue
	
	Block a user