Mirror of https://github.com/meilisearch/meilisearch.git (synced 2025-10-26 05:26:27 +00:00)
			
		
		
		
	Extract the vectors from the documents
This commit is contained in:
		| @@ -47,6 +47,7 @@ pub(crate) fn data_from_obkv_documents( | |||||||
|     faceted_fields: HashSet<FieldId>, |     faceted_fields: HashSet<FieldId>, | ||||||
|     primary_key_id: FieldId, |     primary_key_id: FieldId, | ||||||
|     geo_fields_ids: Option<(FieldId, FieldId)>, |     geo_fields_ids: Option<(FieldId, FieldId)>, | ||||||
|  |     vector_field_id: Option<FieldId>, | ||||||
|     stop_words: Option<fst::Set<&[u8]>>, |     stop_words: Option<fst::Set<&[u8]>>, | ||||||
|     max_positions_per_attributes: Option<u32>, |     max_positions_per_attributes: Option<u32>, | ||||||
|     exact_attributes: HashSet<FieldId>, |     exact_attributes: HashSet<FieldId>, | ||||||
| @@ -71,6 +72,7 @@ pub(crate) fn data_from_obkv_documents( | |||||||
|                     &faceted_fields, |                     &faceted_fields, | ||||||
|                     primary_key_id, |                     primary_key_id, | ||||||
|                     geo_fields_ids, |                     geo_fields_ids, | ||||||
|  |                     vector_field_id, | ||||||
|                     &stop_words, |                     &stop_words, | ||||||
|                     max_positions_per_attributes, |                     max_positions_per_attributes, | ||||||
|                 ) |                 ) | ||||||
| @@ -281,6 +283,7 @@ fn send_and_extract_flattened_documents_data( | |||||||
|     faceted_fields: &HashSet<FieldId>, |     faceted_fields: &HashSet<FieldId>, | ||||||
|     primary_key_id: FieldId, |     primary_key_id: FieldId, | ||||||
|     geo_fields_ids: Option<(FieldId, FieldId)>, |     geo_fields_ids: Option<(FieldId, FieldId)>, | ||||||
|  |     vector_field_id: Option<FieldId>, | ||||||
|     stop_words: &Option<fst::Set<&[u8]>>, |     stop_words: &Option<fst::Set<&[u8]>>, | ||||||
|     max_positions_per_attributes: Option<u32>, |     max_positions_per_attributes: Option<u32>, | ||||||
| ) -> Result<( | ) -> Result<( | ||||||
| @@ -309,6 +312,20 @@ fn send_and_extract_flattened_documents_data( | |||||||
|         }); |         }); | ||||||
|     } |     } | ||||||
|  |  | ||||||
|  |     if let Some(vector_field_id) = vector_field_id { | ||||||
|  |         let documents_chunk_cloned = flattened_documents_chunk.clone(); | ||||||
|  |         let lmdb_writer_sx_cloned = lmdb_writer_sx.clone(); | ||||||
|  |         rayon::spawn(move || { | ||||||
|  |             let result = extract_vector_points(documents_chunk_cloned, indexer, vector_field_id); | ||||||
|  |             let _ = match result { | ||||||
|  |                 Ok(vector_points) => { | ||||||
|  |                     lmdb_writer_sx_cloned.send(Ok(TypedChunk::VectorPoints(vector_points))) | ||||||
|  |                 } | ||||||
|  |                 Err(error) => lmdb_writer_sx_cloned.send(Err(error)), | ||||||
|  |             }; | ||||||
|  |         }); | ||||||
|  |     } | ||||||
|  |  | ||||||
|     let (docid_word_positions_chunk, docid_fid_facet_values_chunks): (Result<_>, Result<_>) = |     let (docid_word_positions_chunk, docid_fid_facet_values_chunks): (Result<_>, Result<_>) = | ||||||
|         rayon::join( |         rayon::join( | ||||||
|             || { |             || { | ||||||
|   | |||||||
| @@ -304,6 +304,8 @@ where | |||||||
|             } |             } | ||||||
|             None => None, |             None => None, | ||||||
|         }; |         }; | ||||||
|  |         // get the fid of the `_vector` field. | ||||||
|  |         let vector_field_id = self.index.fields_ids_map(self.wtxn)?.id("_vector"); | ||||||
|  |  | ||||||
|         let stop_words = self.index.stop_words(self.wtxn)?; |         let stop_words = self.index.stop_words(self.wtxn)?; | ||||||
|         let exact_attributes = self.index.exact_attributes_ids(self.wtxn)?; |         let exact_attributes = self.index.exact_attributes_ids(self.wtxn)?; | ||||||
| @@ -340,6 +342,7 @@ where | |||||||
|                     faceted_fields, |                     faceted_fields, | ||||||
|                     primary_key_id, |                     primary_key_id, | ||||||
|                     geo_fields_ids, |                     geo_fields_ids, | ||||||
|  |                     vector_field_id, | ||||||
|                     stop_words, |                     stop_words, | ||||||
|                     max_positions_per_attributes, |                     max_positions_per_attributes, | ||||||
|                     exact_attributes, |                     exact_attributes, | ||||||
|   | |||||||
| @@ -38,6 +38,7 @@ pub(crate) enum TypedChunk { | |||||||
|     FieldIdFacetIsNullDocids(grenad::Reader<File>), |     FieldIdFacetIsNullDocids(grenad::Reader<File>), | ||||||
|     FieldIdFacetIsEmptyDocids(grenad::Reader<File>), |     FieldIdFacetIsEmptyDocids(grenad::Reader<File>), | ||||||
|     GeoPoints(grenad::Reader<File>), |     GeoPoints(grenad::Reader<File>), | ||||||
|  |     VectorPoints(grenad::Reader<File>), | ||||||
|     ScriptLanguageDocids(HashMap<(Script, Language), RoaringBitmap>), |     ScriptLanguageDocids(HashMap<(Script, Language), RoaringBitmap>), | ||||||
| } | } | ||||||
|  |  | ||||||
| @@ -221,6 +222,29 @@ pub(crate) fn write_typed_chunk_into_index( | |||||||
|             index.put_geo_rtree(wtxn, &rtree)?; |             index.put_geo_rtree(wtxn, &rtree)?; | ||||||
|             index.put_geo_faceted_documents_ids(wtxn, &geo_faceted_docids)?; |             index.put_geo_faceted_documents_ids(wtxn, &geo_faceted_docids)?; | ||||||
|         } |         } | ||||||
|  |         TypedChunk::VectorPoints(vector_points) => { | ||||||
|  |             // let mut rtree = index.geo_rtree(wtxn)?.unwrap_or_default(); | ||||||
|  |             // let mut geo_faceted_docids = index.geo_faceted_documents_ids(wtxn)?; | ||||||
|  |  | ||||||
|  |             // let mut cursor = geo_points.into_cursor()?; | ||||||
|  |             // while let Some((key, value)) = cursor.move_on_next()? { | ||||||
|  |             //     // convert the key back to a u32 (4 bytes) | ||||||
|  |             //     let docid = key.try_into().map(DocumentId::from_be_bytes).unwrap(); | ||||||
|  |  | ||||||
|  |             //     // convert the latitude and longitude back to a f64 (8 bytes) | ||||||
|  |             //     let (lat, tail) = helpers::try_split_array_at::<u8, 8>(value).unwrap(); | ||||||
|  |             //     let (lng, _) = helpers::try_split_array_at::<u8, 8>(tail).unwrap(); | ||||||
|  |             //     let point = [f64::from_ne_bytes(lat), f64::from_ne_bytes(lng)]; | ||||||
|  |             //     let xyz_point = lat_lng_to_xyz(&point); | ||||||
|  |  | ||||||
|  |             //     rtree.insert(GeoPoint::new(xyz_point, (docid, point))); | ||||||
|  |             //     geo_faceted_docids.insert(docid); | ||||||
|  |             // } | ||||||
|  |             // index.put_geo_rtree(wtxn, &rtree)?; | ||||||
|  |             // index.put_geo_faceted_documents_ids(wtxn, &geo_faceted_docids)?; | ||||||
|  |  | ||||||
|  |             todo!("index vector points") | ||||||
|  |         } | ||||||
|         TypedChunk::ScriptLanguageDocids(hash_pair) => { |         TypedChunk::ScriptLanguageDocids(hash_pair) => { | ||||||
|             let mut buffer = Vec::new(); |             let mut buffer = Vec::new(); | ||||||
|             for (key, value) in hash_pair { |             for (key, value) in hash_pair { | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user