mirror of
				https://github.com/meilisearch/meilisearch.git
				synced 2025-10-26 13:36:27 +00:00 
			
		
		
		
	Extract and index data
This commit is contained in:
		| @@ -1,9 +1,9 @@ | |||||||
| use std::collections::HashSet; | use std::collections::{HashMap, HashSet}; | ||||||
| use std::convert::TryInto; | use std::convert::TryInto; | ||||||
| use std::fs::File; | use std::fs::File; | ||||||
| use std::{io, mem, str}; | use std::{io, mem, str}; | ||||||
|  |  | ||||||
| use charabia::{SeparatorKind, Token, TokenKind, TokenizerBuilder}; | use charabia::{Language, Script, SeparatorKind, Token, TokenKind, TokenizerBuilder}; | ||||||
| use roaring::RoaringBitmap; | use roaring::RoaringBitmap; | ||||||
| use serde_json::Value; | use serde_json::Value; | ||||||
|  |  | ||||||
| @@ -25,12 +25,13 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>( | |||||||
|     searchable_fields: &Option<HashSet<FieldId>>, |     searchable_fields: &Option<HashSet<FieldId>>, | ||||||
|     stop_words: Option<&fst::Set<&[u8]>>, |     stop_words: Option<&fst::Set<&[u8]>>, | ||||||
|     max_positions_per_attributes: Option<u32>, |     max_positions_per_attributes: Option<u32>, | ||||||
| ) -> Result<(RoaringBitmap, grenad::Reader<File>)> { | ) -> Result<(RoaringBitmap, grenad::Reader<File>, HashMap<(Script, Language), RoaringBitmap>)> { | ||||||
|     let max_positions_per_attributes = max_positions_per_attributes |     let max_positions_per_attributes = max_positions_per_attributes | ||||||
|         .map_or(MAX_POSITION_PER_ATTRIBUTE, |max| max.min(MAX_POSITION_PER_ATTRIBUTE)); |         .map_or(MAX_POSITION_PER_ATTRIBUTE, |max| max.min(MAX_POSITION_PER_ATTRIBUTE)); | ||||||
|     let max_memory = indexer.max_memory_by_thread(); |     let max_memory = indexer.max_memory_by_thread(); | ||||||
|  |  | ||||||
|     let mut documents_ids = RoaringBitmap::new(); |     let mut documents_ids = RoaringBitmap::new(); | ||||||
|  |     let mut script_language_pair = HashMap::new(); | ||||||
|     let mut docid_word_positions_sorter = create_sorter( |     let mut docid_word_positions_sorter = create_sorter( | ||||||
|         grenad::SortAlgorithm::Stable, |         grenad::SortAlgorithm::Stable, | ||||||
|         concat_u32s_array, |         concat_u32s_array, | ||||||
| @@ -70,6 +71,12 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>( | |||||||
|                         .take_while(|(p, _)| (*p as u32) < max_positions_per_attributes); |                         .take_while(|(p, _)| (*p as u32) < max_positions_per_attributes); | ||||||
|  |  | ||||||
|                     for (index, token) in tokens { |                     for (index, token) in tokens { | ||||||
|  |                         let script = token.script; | ||||||
|  |                         let language = token.language.unwrap_or_default(); | ||||||
|  |                         let entry = script_language_pair | ||||||
|  |                             .entry((script, language)) | ||||||
|  |                             .or_insert_with(RoaringBitmap::new); | ||||||
|  |                         entry.push(document_id); | ||||||
|                         let token = token.lemma().trim(); |                         let token = token.lemma().trim(); | ||||||
|                         if !token.is_empty() && token.len() <= MAX_WORD_LENGTH { |                         if !token.is_empty() && token.len() <= MAX_WORD_LENGTH { | ||||||
|                             key_buffer.truncate(mem::size_of::<u32>()); |                             key_buffer.truncate(mem::size_of::<u32>()); | ||||||
| @@ -88,7 +95,7 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>( | |||||||
|         } |         } | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     sorter_into_reader(docid_word_positions_sorter, indexer).map(|reader| (documents_ids, reader)) |     sorter_into_reader(docid_word_positions_sorter, indexer).map(|reader| (documents_ids, reader, script_language_pair)) | ||||||
| } | } | ||||||
|  |  | ||||||
| /// Transform a JSON value into a string that can be indexed. | /// Transform a JSON value into a string that can be indexed. | ||||||
|   | |||||||
| @@ -257,7 +257,7 @@ fn send_and_extract_flattened_documents_data( | |||||||
|     let (docid_word_positions_chunk, docid_fid_facet_values_chunks): (Result<_>, Result<_>) = |     let (docid_word_positions_chunk, docid_fid_facet_values_chunks): (Result<_>, Result<_>) = | ||||||
|         rayon::join( |         rayon::join( | ||||||
|             || { |             || { | ||||||
|                 let (documents_ids, docid_word_positions_chunk) = extract_docid_word_positions( |                 let (documents_ids, docid_word_positions_chunk, script_language_pair) = extract_docid_word_positions( | ||||||
|                     flattened_documents_chunk.clone(), |                     flattened_documents_chunk.clone(), | ||||||
|                     indexer, |                     indexer, | ||||||
|                     searchable_fields, |                     searchable_fields, | ||||||
| @@ -274,6 +274,8 @@ fn send_and_extract_flattened_documents_data( | |||||||
|                 let _ = lmdb_writer_sx |                 let _ = lmdb_writer_sx | ||||||
|                     .send(Ok(TypedChunk::DocidWordPositions(docid_word_positions_chunk.clone()))); |                     .send(Ok(TypedChunk::DocidWordPositions(docid_word_positions_chunk.clone()))); | ||||||
|  |  | ||||||
|  |                 let _ = lmdb_writer_sx.send(Ok(TypedChunk::ScriptLanguageDocids(script_language_pair))); | ||||||
|  |  | ||||||
|                 Ok(docid_word_positions_chunk) |                 Ok(docid_word_positions_chunk) | ||||||
|             }, |             }, | ||||||
|             || { |             || { | ||||||
|   | |||||||
| @@ -1,8 +1,10 @@ | |||||||
| use std::borrow::Cow; | use std::borrow::Cow; | ||||||
|  | use std::collections::HashMap; | ||||||
| use std::convert::TryInto; | use std::convert::TryInto; | ||||||
| use std::fs::File; | use std::fs::File; | ||||||
| use std::io; | use std::io; | ||||||
|  |  | ||||||
|  | use charabia::{Language, Script}; | ||||||
| use grenad::MergerBuilder; | use grenad::MergerBuilder; | ||||||
| use heed::types::ByteSlice; | use heed::types::ByteSlice; | ||||||
| use heed::{BytesDecode, RwTxn}; | use heed::{BytesDecode, RwTxn}; | ||||||
| @@ -16,10 +18,7 @@ use super::{ClonableMmap, MergeFn}; | |||||||
| use crate::facet::FacetType; | use crate::facet::FacetType; | ||||||
| use crate::update::facet::FacetsUpdate; | use crate::update::facet::FacetsUpdate; | ||||||
| use crate::update::index_documents::helpers::as_cloneable_grenad; | use crate::update::index_documents::helpers::as_cloneable_grenad; | ||||||
| use crate::{ | use crate::{BoRoaringBitmapCodec, CboRoaringBitmapCodec, DocumentId, GeoPoint, Index, Result, lat_lng_to_xyz}; | ||||||
|     lat_lng_to_xyz, BoRoaringBitmapCodec, CboRoaringBitmapCodec, DocumentId, GeoPoint, Index, |  | ||||||
|     Result, |  | ||||||
| }; |  | ||||||
|  |  | ||||||
| pub(crate) enum TypedChunk { | pub(crate) enum TypedChunk { | ||||||
|     DocidWordPositions(grenad::Reader<CursorClonableMmap>), |     DocidWordPositions(grenad::Reader<CursorClonableMmap>), | ||||||
| @@ -38,6 +37,7 @@ pub(crate) enum TypedChunk { | |||||||
|     FieldIdFacetNumberDocids(grenad::Reader<File>), |     FieldIdFacetNumberDocids(grenad::Reader<File>), | ||||||
|     FieldIdFacetExistsDocids(grenad::Reader<File>), |     FieldIdFacetExistsDocids(grenad::Reader<File>), | ||||||
|     GeoPoints(grenad::Reader<File>), |     GeoPoints(grenad::Reader<File>), | ||||||
|  |     ScriptLanguageDocids(HashMap<(Script, Language), RoaringBitmap>) | ||||||
| } | } | ||||||
|  |  | ||||||
| /// Write typed chunk in the corresponding LMDB database of the provided index. | /// Write typed chunk in the corresponding LMDB database of the provided index. | ||||||
| @@ -210,6 +210,25 @@ pub(crate) fn write_typed_chunk_into_index( | |||||||
|             index.put_geo_rtree(wtxn, &rtree)?; |             index.put_geo_rtree(wtxn, &rtree)?; | ||||||
|             index.put_geo_faceted_documents_ids(wtxn, &geo_faceted_docids)?; |             index.put_geo_faceted_documents_ids(wtxn, &geo_faceted_docids)?; | ||||||
|         } |         } | ||||||
|  |         TypedChunk::ScriptLanguageDocids(hash_pair) => { | ||||||
|  |             let mut buffer = Vec::new(); | ||||||
|  |             for (key, value) in hash_pair { | ||||||
|  |                 buffer.clear(); | ||||||
|  |                 let final_value = match index.script_language_docids.get(wtxn, &key)? { | ||||||
|  |                     Some(db_values) => { | ||||||
|  |                         let mut db_value_buffer = Vec::new(); | ||||||
|  |                         serialize_roaring_bitmap(&db_values, &mut db_value_buffer)?; | ||||||
|  |                         let mut new_value_buffer = Vec::new(); | ||||||
|  |                         serialize_roaring_bitmap(&value, &mut new_value_buffer)?; | ||||||
|  |                         merge_roaring_bitmaps(&new_value_buffer, &db_value_buffer, &mut buffer)?; | ||||||
|  |                         let merged_db_values = RoaringBitmap::deserialize_from(&buffer[..])?; | ||||||
|  |                         merged_db_values | ||||||
|  |                     } | ||||||
|  |                     None => value | ||||||
|  |                 }; | ||||||
|  |                 index.script_language_docids.put(wtxn, &key, &final_value)?; | ||||||
|  |             } | ||||||
|  |         }  | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     Ok((RoaringBitmap::new(), is_merged_database)) |     Ok((RoaringBitmap::new(), is_merged_database)) | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user