mirror of
				https://github.com/meilisearch/meilisearch.git
				synced 2025-10-26 05:26:27 +00:00 
			
		
		
		
	Extract and index data
This commit is contained in:
		| @@ -1,9 +1,9 @@ | ||||
| use std::collections::HashSet; | ||||
| use std::collections::{HashMap, HashSet}; | ||||
| use std::convert::TryInto; | ||||
| use std::fs::File; | ||||
| use std::{io, mem, str}; | ||||
|  | ||||
| use charabia::{SeparatorKind, Token, TokenKind, TokenizerBuilder}; | ||||
| use charabia::{Language, Script, SeparatorKind, Token, TokenKind, TokenizerBuilder}; | ||||
| use roaring::RoaringBitmap; | ||||
| use serde_json::Value; | ||||
|  | ||||
| @@ -25,12 +25,13 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>( | ||||
|     searchable_fields: &Option<HashSet<FieldId>>, | ||||
|     stop_words: Option<&fst::Set<&[u8]>>, | ||||
|     max_positions_per_attributes: Option<u32>, | ||||
| ) -> Result<(RoaringBitmap, grenad::Reader<File>)> { | ||||
| ) -> Result<(RoaringBitmap, grenad::Reader<File>, HashMap<(Script, Language), RoaringBitmap>)> { | ||||
|     let max_positions_per_attributes = max_positions_per_attributes | ||||
|         .map_or(MAX_POSITION_PER_ATTRIBUTE, |max| max.min(MAX_POSITION_PER_ATTRIBUTE)); | ||||
|     let max_memory = indexer.max_memory_by_thread(); | ||||
|  | ||||
|     let mut documents_ids = RoaringBitmap::new(); | ||||
|     let mut script_language_pair = HashMap::new(); | ||||
|     let mut docid_word_positions_sorter = create_sorter( | ||||
|         grenad::SortAlgorithm::Stable, | ||||
|         concat_u32s_array, | ||||
| @@ -70,6 +71,12 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>( | ||||
|                         .take_while(|(p, _)| (*p as u32) < max_positions_per_attributes); | ||||
|  | ||||
|                     for (index, token) in tokens { | ||||
|                         let script = token.script; | ||||
|                         let language = token.language.unwrap_or_default(); | ||||
|                         let entry = script_language_pair | ||||
|                             .entry((script, language)) | ||||
|                             .or_insert_with(RoaringBitmap::new); | ||||
|                         entry.push(document_id); | ||||
|                         let token = token.lemma().trim(); | ||||
|                         if !token.is_empty() && token.len() <= MAX_WORD_LENGTH { | ||||
|                             key_buffer.truncate(mem::size_of::<u32>()); | ||||
| @@ -88,7 +95,7 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>( | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     sorter_into_reader(docid_word_positions_sorter, indexer).map(|reader| (documents_ids, reader)) | ||||
|     sorter_into_reader(docid_word_positions_sorter, indexer).map(|reader| (documents_ids, reader, script_language_pair)) | ||||
| } | ||||
|  | ||||
| /// Transform a JSON value into a string that can be indexed. | ||||
|   | ||||
| @@ -257,7 +257,7 @@ fn send_and_extract_flattened_documents_data( | ||||
|     let (docid_word_positions_chunk, docid_fid_facet_values_chunks): (Result<_>, Result<_>) = | ||||
|         rayon::join( | ||||
|             || { | ||||
|                 let (documents_ids, docid_word_positions_chunk) = extract_docid_word_positions( | ||||
|                 let (documents_ids, docid_word_positions_chunk, script_language_pair) = extract_docid_word_positions( | ||||
|                     flattened_documents_chunk.clone(), | ||||
|                     indexer, | ||||
|                     searchable_fields, | ||||
| @@ -274,6 +274,8 @@ fn send_and_extract_flattened_documents_data( | ||||
|                 let _ = lmdb_writer_sx | ||||
|                     .send(Ok(TypedChunk::DocidWordPositions(docid_word_positions_chunk.clone()))); | ||||
|  | ||||
|                 let _ = lmdb_writer_sx.send(Ok(TypedChunk::ScriptLanguageDocids(script_language_pair))); | ||||
|  | ||||
|                 Ok(docid_word_positions_chunk) | ||||
|             }, | ||||
|             || { | ||||
|   | ||||
| @@ -1,8 +1,10 @@ | ||||
| use std::borrow::Cow; | ||||
| use std::collections::HashMap; | ||||
| use std::convert::TryInto; | ||||
| use std::fs::File; | ||||
| use std::io; | ||||
|  | ||||
| use charabia::{Language, Script}; | ||||
| use grenad::MergerBuilder; | ||||
| use heed::types::ByteSlice; | ||||
| use heed::{BytesDecode, RwTxn}; | ||||
| @@ -16,10 +18,7 @@ use super::{ClonableMmap, MergeFn}; | ||||
| use crate::facet::FacetType; | ||||
| use crate::update::facet::FacetsUpdate; | ||||
| use crate::update::index_documents::helpers::as_cloneable_grenad; | ||||
| use crate::{ | ||||
|     lat_lng_to_xyz, BoRoaringBitmapCodec, CboRoaringBitmapCodec, DocumentId, GeoPoint, Index, | ||||
|     Result, | ||||
| }; | ||||
| use crate::{BoRoaringBitmapCodec, CboRoaringBitmapCodec, DocumentId, GeoPoint, Index, Result, lat_lng_to_xyz}; | ||||
|  | ||||
| pub(crate) enum TypedChunk { | ||||
|     DocidWordPositions(grenad::Reader<CursorClonableMmap>), | ||||
| @@ -38,6 +37,7 @@ pub(crate) enum TypedChunk { | ||||
|     FieldIdFacetNumberDocids(grenad::Reader<File>), | ||||
|     FieldIdFacetExistsDocids(grenad::Reader<File>), | ||||
|     GeoPoints(grenad::Reader<File>), | ||||
|     ScriptLanguageDocids(HashMap<(Script, Language), RoaringBitmap>) | ||||
| } | ||||
|  | ||||
| /// Write typed chunk in the corresponding LMDB database of the provided index. | ||||
| @@ -210,6 +210,25 @@ pub(crate) fn write_typed_chunk_into_index( | ||||
|             index.put_geo_rtree(wtxn, &rtree)?; | ||||
|             index.put_geo_faceted_documents_ids(wtxn, &geo_faceted_docids)?; | ||||
|         } | ||||
|         TypedChunk::ScriptLanguageDocids(hash_pair) => { | ||||
|             let mut buffer = Vec::new(); | ||||
|             for (key, value) in hash_pair { | ||||
|                 buffer.clear(); | ||||
|                 let final_value = match index.script_language_docids.get(wtxn, &key)? { | ||||
|                     Some(db_values) => { | ||||
|                         let mut db_value_buffer = Vec::new(); | ||||
|                         serialize_roaring_bitmap(&db_values, &mut db_value_buffer)?; | ||||
|                         let mut new_value_buffer = Vec::new(); | ||||
|                         serialize_roaring_bitmap(&value, &mut new_value_buffer)?; | ||||
|                         merge_roaring_bitmaps(&new_value_buffer, &db_value_buffer, &mut buffer)?; | ||||
|                         let merged_db_values = RoaringBitmap::deserialize_from(&buffer[..])?; | ||||
|                         merged_db_values | ||||
|                     } | ||||
|                     None => value | ||||
|                 }; | ||||
|                 index.script_language_docids.put(wtxn, &key, &final_value)?; | ||||
|             } | ||||
|         }  | ||||
|     } | ||||
|  | ||||
|     Ok((RoaringBitmap::new(), is_merged_database)) | ||||
|   | ||||
		Reference in New Issue
	
	Block a user