	Refactor Document indexing process (searchables)
**Changes:** The searchable database extraction now relies on `AttributePatterns` and `FieldIdMapWithMetadata` to match the fields to extract. The `SearchableExtractor` trait is removed to make the code less complex.

**Impact:**
- Document addition/modification searchable indexing
- Document deletion searchable indexing
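As background for the diff below, the refactor funnels every "is this field searchable?" decision through a single helper, `match_searchable_field`, instead of the old `attributes_to_extract` / `attributes_to_skip` pair on the removed `SearchableExtractor` trait. The following is a minimal, self-contained sketch of that decision flow; the local `PatternMatch` enum and the simplified `matches_pattern` stand-in for milli's `match_field_legacy` (reduced here to exact names and a trailing `.*` wildcard) are illustrative assumptions, not the actual implementation.

```rust
/// Sketch of the field-selection flow introduced by this commit.
/// `PatternMatch` mirrors the three outcomes used by the extractors; the
/// pattern matching itself is deliberately simplified and does not
/// reproduce `match_field_legacy` exactly.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum PatternMatch {
    Match,   // the field must be tokenized
    Parent,  // a nested child of this field may still match
    NoMatch, // the field is not searchable
}

fn matches_pattern(pattern: &str, field_name: &str) -> PatternMatch {
    if pattern == field_name || pattern == "*" {
        PatternMatch::Match
    } else if let Some(prefix) = pattern.strip_suffix(".*") {
        if field_name == prefix || field_name.starts_with(&format!("{prefix}.")) {
            PatternMatch::Match
        } else {
            PatternMatch::NoMatch
        }
    } else if pattern.starts_with(&format!("{field_name}.")) {
        // The field is an ancestor of a searchable pattern: keep descending.
        PatternMatch::Parent
    } else {
        PatternMatch::NoMatch
    }
}

/// Same shape as the helper added in this commit: no user-defined list means
/// every field is searchable; otherwise the first `Match` wins and `Parent`
/// is remembered so nested objects are still traversed.
fn match_searchable_field(field_name: &str, searchable: Option<&[&str]>) -> PatternMatch {
    let Some(searchable) = searchable else { return PatternMatch::Match };
    let mut selection = PatternMatch::NoMatch;
    for pattern in searchable {
        match matches_pattern(pattern, field_name) {
            PatternMatch::Match => return PatternMatch::Match,
            PatternMatch::Parent => selection = PatternMatch::Parent,
            PatternMatch::NoMatch => (),
        }
    }
    selection
}

fn main() {
    let searchable = Some(&["title", "author.*"][..]);
    assert_eq!(match_searchable_field("title", searchable), PatternMatch::Match);
    assert_eq!(match_searchable_field("author.name", searchable), PatternMatch::Match);
    assert_eq!(match_searchable_field("price", searchable), PatternMatch::NoMatch);
    assert_eq!(match_searchable_field("id", None), PatternMatch::Match);
}
```

The `Parent` variant is presumably what lets traversal continue into nested objects whose children may still match a pattern; the closure-based `has_changed_for_fields` call sites in the diff below reflect the same change.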
@@ -5,8 +5,8 @@ use std::ops::DerefMut as _;

use bumpalo::collections::vec::Vec as BumpVec;
use bumpalo::Bump;
use heed::RoTxn;

use super::match_searchable_field;
use super::tokenize_document::{tokenizer_builder, DocumentTokenizer};
use crate::update::new::extract::cache::BalancedCaches;
use crate::update::new::extract::perm_json_p::contained_in;
@@ -17,8 +17,7 @@ use crate::update::new::ref_cell_ext::RefCellExt as _;
use crate::update::new::steps::IndexingStep;
use crate::update::new::thread_local::{FullySend, MostlySend, ThreadLocal};
use crate::update::new::DocumentChange;
use crate::update::GrenadParameters;
use crate::{bucketed_position, DocumentId, FieldId, Index, Result, MAX_POSITION_PER_ATTRIBUTE};
use crate::{bucketed_position, DocumentId, FieldId, Result, MAX_POSITION_PER_ATTRIBUTE};

const MAX_COUNTED_WORDS: usize = 30;

@@ -207,9 +206,10 @@ impl<'extractor> WordDocidsCaches<'extractor> {
}

pub struct WordDocidsExtractorData<'a> {
    tokenizer: &'a DocumentTokenizer<'a>,
    grenad_parameters: &'a GrenadParameters,
    tokenizer: DocumentTokenizer<'a>,
    max_memory_by_thread: Option<usize>,
    buckets: usize,
    searchable_attributes: Option<Vec<&'a str>>,
}

impl<'a, 'extractor> Extractor<'extractor> for WordDocidsExtractorData<'a> {
@@ -218,7 +218,7 @@ impl<'a, 'extractor> Extractor<'extractor> for WordDocidsExtractorData<'a> {
    fn init_data(&self, extractor_alloc: &'extractor Bump) -> Result<Self::Data> {
        Ok(RefCell::new(Some(WordDocidsBalancedCaches::new_in(
            self.buckets,
            self.grenad_parameters.max_memory_by_thread(),
            self.max_memory_by_thread,
            extractor_alloc,
        ))))
    }
@@ -230,7 +230,12 @@ impl<'a, 'extractor> Extractor<'extractor> for WordDocidsExtractorData<'a> {
    ) -> Result<()> {
        for change in changes {
            let change = change?;
            WordDocidsExtractors::extract_document_change(context, self.tokenizer, change)?;
            WordDocidsExtractors::extract_document_change(
                context,
                &self.tokenizer,
                self.searchable_attributes.as_deref(),
                change,
            )?;
        }
        Ok(())
    }
@@ -248,52 +253,42 @@ impl WordDocidsExtractors {
    where
        MSP: Fn() -> bool + Sync,
    {
        let index = indexing_context.index;
        let rtxn = index.read_txn()?;

        let stop_words = index.stop_words(&rtxn)?;
        let allowed_separators = index.allowed_separators(&rtxn)?;
        // Warning: this is duplicated code from extract_word_pair_proximity_docids.rs
        let rtxn = indexing_context.index.read_txn()?;
        let stop_words = indexing_context.index.stop_words(&rtxn)?;
        let allowed_separators = indexing_context.index.allowed_separators(&rtxn)?;
        let allowed_separators: Option<Vec<_>> =
            allowed_separators.as_ref().map(|s| s.iter().map(String::as_str).collect());
        let dictionary = index.dictionary(&rtxn)?;
        let dictionary = indexing_context.index.dictionary(&rtxn)?;
        let dictionary: Option<Vec<_>> =
            dictionary.as_ref().map(|s| s.iter().map(String::as_str).collect());
        let builder = tokenizer_builder(
        let mut builder = tokenizer_builder(
            stop_words.as_ref(),
            allowed_separators.as_deref(),
            dictionary.as_deref(),
        );
        let tokenizer = builder.into_tokenizer();

        let attributes_to_extract = Self::attributes_to_extract(&rtxn, index)?;
        let attributes_to_skip = Self::attributes_to_skip(&rtxn, index)?;
        let tokenizer = builder.build();
        let localized_attributes_rules =
            index.localized_attributes_rules(&rtxn)?.unwrap_or_default();

            indexing_context.index.localized_attributes_rules(&rtxn)?.unwrap_or_default();
        let document_tokenizer = DocumentTokenizer {
            tokenizer: &tokenizer,
            attribute_to_extract: attributes_to_extract.as_deref(),
            attribute_to_skip: attributes_to_skip.as_slice(),
            localized_attributes_rules: &localized_attributes_rules,
            max_positions_per_attributes: MAX_POSITION_PER_ATTRIBUTE,
        };

        let extractor_data = WordDocidsExtractorData {
            tokenizer: document_tokenizer,
            max_memory_by_thread: indexing_context.grenad_parameters.max_memory_by_thread(),
            buckets: rayon::current_num_threads(),
            searchable_attributes: indexing_context.index.user_defined_searchable_fields(&rtxn)?,
        };
        let datastore = ThreadLocal::new();

        {
            let span =
                tracing::trace_span!(target: "indexing::documents::extract", "docids_extraction");
            let _entered = span.enter();

            let extractor = WordDocidsExtractorData {
                tokenizer: &document_tokenizer,
                grenad_parameters: indexing_context.grenad_parameters,
                buckets: rayon::current_num_threads(),
            };

            extract(
                document_changes,
                &extractor,
                &extractor_data,
                indexing_context,
                extractor_allocs,
                &datastore,
@@ -312,6 +307,7 @@ impl WordDocidsExtractors {
    fn extract_document_change(
        context: &DocumentChangeContext<RefCell<Option<WordDocidsBalancedCaches>>>,
        document_tokenizer: &DocumentTokenizer,
        searchable_attributes: Option<&[&str]>,
        document_change: DocumentChange,
    ) -> Result<()> {
        let index = &context.index;
@@ -345,7 +341,9 @@ impl WordDocidsExtractors {
            }
            DocumentChange::Update(inner) => {
                if !inner.has_changed_for_fields(
                    document_tokenizer.attribute_to_extract,
                    &mut |field_name: &str| {
                        match_searchable_field(field_name, searchable_attributes)
                    },
                    &context.rtxn,
                    context.index,
                    context.db_fields_ids_map,
@@ -408,15 +406,4 @@ impl WordDocidsExtractors {
        let mut buffer = BumpVec::with_capacity_in(buffer_size, &context.doc_alloc);
        cached_sorter.flush_fid_word_count(&mut buffer)
    }

    fn attributes_to_extract<'a>(
        rtxn: &'a RoTxn,
        index: &'a Index,
    ) -> Result<Option<Vec<&'a str>>> {
        index.user_defined_searchable_fields(rtxn).map_err(Into::into)
    }

    fn attributes_to_skip<'a>(_rtxn: &'a RoTxn, _index: &'a Index) -> Result<Vec<&'a str>> {
        Ok(Vec::new())
    }
}

@@ -2,30 +2,114 @@ use std::cell::RefCell;
use std::collections::VecDeque;
use std::rc::Rc;

use heed::RoTxn;
use bumpalo::Bump;

use super::tokenize_document::DocumentTokenizer;
use super::SearchableExtractor;
use super::match_searchable_field;
use super::tokenize_document::{tokenizer_builder, DocumentTokenizer};
use crate::proximity::{index_proximity, MAX_DISTANCE};
use crate::update::new::document::Document;
use crate::update::new::extract::cache::BalancedCaches;
use crate::update::new::indexer::document_changes::DocumentChangeContext;
use crate::update::new::indexer::document_changes::{
    extract, DocumentChangeContext, DocumentChanges, Extractor, IndexingContext,
};
use crate::update::new::ref_cell_ext::RefCellExt as _;
use crate::update::new::steps::IndexingStep;
use crate::update::new::thread_local::{FullySend, ThreadLocal};
use crate::update::new::DocumentChange;
use crate::{FieldId, GlobalFieldsIdsMap, Index, Result};
use crate::{FieldId, GlobalFieldsIdsMap, Result, MAX_POSITION_PER_ATTRIBUTE};

pub struct WordPairProximityDocidsExtractorData<'a> {
    tokenizer: DocumentTokenizer<'a>,
    searchable_attributes: Option<Vec<&'a str>>,
    max_memory_by_thread: Option<usize>,
    buckets: usize,
}

impl<'a, 'extractor> Extractor<'extractor> for WordPairProximityDocidsExtractorData<'a> {
    type Data = RefCell<BalancedCaches<'extractor>>;

    fn init_data(&self, extractor_alloc: &'extractor Bump) -> Result<Self::Data> {
        Ok(RefCell::new(BalancedCaches::new_in(
            self.buckets,
            self.max_memory_by_thread,
            extractor_alloc,
        )))
    }

    fn process<'doc>(
        &self,
        changes: impl Iterator<Item = Result<DocumentChange<'doc>>>,
        context: &DocumentChangeContext<Self::Data>,
    ) -> Result<()> {
        for change in changes {
            let change = change?;
            WordPairProximityDocidsExtractor::extract_document_change(
                context,
                &self.tokenizer,
                self.searchable_attributes.as_deref(),
                change,
            )?;
        }
        Ok(())
    }
}

pub struct WordPairProximityDocidsExtractor;

impl SearchableExtractor for WordPairProximityDocidsExtractor {
    fn attributes_to_extract<'a>(
        rtxn: &'a RoTxn,
        index: &'a Index,
    ) -> Result<Option<Vec<&'a str>>> {
        index.user_defined_searchable_fields(rtxn).map_err(Into::into)
    }
impl WordPairProximityDocidsExtractor {
    pub fn run_extraction<'pl, 'fid, 'indexer, 'index, 'extractor, DC: DocumentChanges<'pl>, MSP>(
        document_changes: &DC,
        indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP>,
        extractor_allocs: &'extractor mut ThreadLocal<FullySend<Bump>>,
        step: IndexingStep,
    ) -> Result<Vec<BalancedCaches<'extractor>>>
    where
        MSP: Fn() -> bool + Sync,
    {
        // Warning: this is duplicated code from extract_word_docids.rs
        let rtxn = indexing_context.index.read_txn()?;
        let stop_words = indexing_context.index.stop_words(&rtxn)?;
        let allowed_separators = indexing_context.index.allowed_separators(&rtxn)?;
        let allowed_separators: Option<Vec<_>> =
            allowed_separators.as_ref().map(|s| s.iter().map(String::as_str).collect());
        let dictionary = indexing_context.index.dictionary(&rtxn)?;
        let dictionary: Option<Vec<_>> =
            dictionary.as_ref().map(|s| s.iter().map(String::as_str).collect());
        let mut builder = tokenizer_builder(
            stop_words.as_ref(),
            allowed_separators.as_deref(),
            dictionary.as_deref(),
        );
        let tokenizer = builder.build();
        let localized_attributes_rules =
            indexing_context.index.localized_attributes_rules(&rtxn)?.unwrap_or_default();
        let document_tokenizer = DocumentTokenizer {
            tokenizer: &tokenizer,
            localized_attributes_rules: &localized_attributes_rules,
            max_positions_per_attributes: MAX_POSITION_PER_ATTRIBUTE,
        };
        let extractor_data = WordPairProximityDocidsExtractorData {
            tokenizer: document_tokenizer,
            searchable_attributes: indexing_context.index.user_defined_searchable_fields(&rtxn)?,
            max_memory_by_thread: indexing_context.grenad_parameters.max_memory_by_thread(),
            buckets: rayon::current_num_threads(),
        };
        let datastore = ThreadLocal::new();
        {
            let span =
                tracing::trace_span!(target: "indexing::documents::extract", "docids_extraction");
            let _entered = span.enter();
            extract(
                document_changes,
                &extractor_data,
                indexing_context,
                extractor_allocs,
                &datastore,
                step,
            )?;
        }

    fn attributes_to_skip<'a>(_rtxn: &'a RoTxn, _index: &'a Index) -> Result<Vec<&'a str>> {
        Ok(Vec::new())
        Ok(datastore.into_iter().map(RefCell::into_inner).collect())
    }

    // This method is reimplemented to count the number of words in the document in each field
@@ -34,6 +118,7 @@ impl SearchableExtractor for WordPairProximityDocidsExtractor {
    fn extract_document_change(
        context: &DocumentChangeContext<RefCell<BalancedCaches>>,
        document_tokenizer: &DocumentTokenizer,
        searchable_attributes: Option<&[&str]>,
        document_change: DocumentChange,
    ) -> Result<()> {
        let doc_alloc = &context.doc_alloc;
@@ -71,7 +156,9 @@ impl SearchableExtractor for WordPairProximityDocidsExtractor {
            }
            DocumentChange::Update(inner) => {
                if !inner.has_changed_for_fields(
                    document_tokenizer.attribute_to_extract,
                    &mut |field_name: &str| {
                        match_searchable_field(field_name, searchable_attributes)
                    },
                    rtxn,
                    index,
                    context.db_fields_ids_map,

@@ -2,145 +2,28 @@ mod extract_word_docids;
mod extract_word_pair_proximity_docids;
mod tokenize_document;

use std::cell::RefCell;
use std::marker::PhantomData;

use bumpalo::Bump;
pub use extract_word_docids::{WordDocidsCaches, WordDocidsExtractors};
pub use extract_word_pair_proximity_docids::WordPairProximityDocidsExtractor;
use heed::RoTxn;
use tokenize_document::{tokenizer_builder, DocumentTokenizer};

use super::cache::BalancedCaches;
use super::DocidsExtractor;
use crate::update::new::indexer::document_changes::{
    extract, DocumentChangeContext, DocumentChanges, Extractor, IndexingContext,
};
use crate::update::new::steps::IndexingStep;
use crate::update::new::thread_local::{FullySend, ThreadLocal};
use crate::update::new::DocumentChange;
use crate::update::GrenadParameters;
use crate::{Index, Result, MAX_POSITION_PER_ATTRIBUTE};
use crate::attribute_patterns::{match_field_legacy, PatternMatch};

pub struct SearchableExtractorData<'a, EX: SearchableExtractor> {
    tokenizer: &'a DocumentTokenizer<'a>,
    grenad_parameters: &'a GrenadParameters,
    buckets: usize,
    _ex: PhantomData<EX>,
}
pub fn match_searchable_field(
    field_name: &str,
    searchable_fields: Option<&[&str]>,
) -> PatternMatch {
    let Some(searchable_fields) = searchable_fields else {
        // If no searchable fields are provided, consider all fields as searchable
        return PatternMatch::Match;
    };

impl<'a, 'extractor, EX: SearchableExtractor + Sync> Extractor<'extractor>
    for SearchableExtractorData<'a, EX>
{
    type Data = RefCell<BalancedCaches<'extractor>>;

    fn init_data(&self, extractor_alloc: &'extractor Bump) -> Result<Self::Data> {
        Ok(RefCell::new(BalancedCaches::new_in(
            self.buckets,
            self.grenad_parameters.max_memory_by_thread(),
            extractor_alloc,
        )))
    }

    fn process<'doc>(
        &self,
        changes: impl Iterator<Item = Result<DocumentChange<'doc>>>,
        context: &DocumentChangeContext<Self::Data>,
    ) -> Result<()> {
        for change in changes {
            let change = change?;
            EX::extract_document_change(context, self.tokenizer, change)?;
    let mut selection = PatternMatch::NoMatch;
    for pattern in searchable_fields {
        match match_field_legacy(pattern, field_name) {
            PatternMatch::Match => return PatternMatch::Match,
            PatternMatch::Parent => selection = PatternMatch::Parent,
            PatternMatch::NoMatch => (),
        }
        Ok(())
    }
}

pub trait SearchableExtractor: Sized + Sync {
    fn run_extraction<'pl, 'fid, 'indexer, 'index, 'extractor, DC: DocumentChanges<'pl>, MSP>(
        document_changes: &DC,
        indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP>,
        extractor_allocs: &'extractor mut ThreadLocal<FullySend<Bump>>,
        step: IndexingStep,
    ) -> Result<Vec<BalancedCaches<'extractor>>>
    where
        MSP: Fn() -> bool + Sync,
    {
        let rtxn = indexing_context.index.read_txn()?;
        let stop_words = indexing_context.index.stop_words(&rtxn)?;
        let allowed_separators = indexing_context.index.allowed_separators(&rtxn)?;
        let allowed_separators: Option<Vec<_>> =
            allowed_separators.as_ref().map(|s| s.iter().map(String::as_str).collect());
        let dictionary = indexing_context.index.dictionary(&rtxn)?;
        let dictionary: Option<Vec<_>> =
            dictionary.as_ref().map(|s| s.iter().map(String::as_str).collect());
        let mut builder = tokenizer_builder(
            stop_words.as_ref(),
            allowed_separators.as_deref(),
            dictionary.as_deref(),
        );
        let tokenizer = builder.build();

        let attributes_to_extract = Self::attributes_to_extract(&rtxn, indexing_context.index)?;
        let attributes_to_skip = Self::attributes_to_skip(&rtxn, indexing_context.index)?;
        let localized_attributes_rules =
            indexing_context.index.localized_attributes_rules(&rtxn)?.unwrap_or_default();

        let document_tokenizer = DocumentTokenizer {
            tokenizer: &tokenizer,
            attribute_to_extract: attributes_to_extract.as_deref(),
            attribute_to_skip: attributes_to_skip.as_slice(),
            localized_attributes_rules: &localized_attributes_rules,
            max_positions_per_attributes: MAX_POSITION_PER_ATTRIBUTE,
        };

        let extractor_data: SearchableExtractorData<Self> = SearchableExtractorData {
            tokenizer: &document_tokenizer,
            grenad_parameters: indexing_context.grenad_parameters,
            buckets: rayon::current_num_threads(),
            _ex: PhantomData,
        };

        let datastore = ThreadLocal::new();

        {
            let span =
                tracing::trace_span!(target: "indexing::documents::extract", "docids_extraction");
            let _entered = span.enter();
            extract(
                document_changes,
                &extractor_data,
                indexing_context,
                extractor_allocs,
                &datastore,
                step,
            )?;
        }

        Ok(datastore.into_iter().map(RefCell::into_inner).collect())
    }

    fn extract_document_change(
        context: &DocumentChangeContext<RefCell<BalancedCaches>>,
        document_tokenizer: &DocumentTokenizer,
        document_change: DocumentChange,
    ) -> Result<()>;

    fn attributes_to_extract<'a>(rtxn: &'a RoTxn, index: &'a Index)
        -> Result<Option<Vec<&'a str>>>;

    fn attributes_to_skip<'a>(rtxn: &'a RoTxn, index: &'a Index) -> Result<Vec<&'a str>>;
}

impl<T: SearchableExtractor> DocidsExtractor for T {
    fn run_extraction<'pl, 'fid, 'indexer, 'index, 'extractor, DC: DocumentChanges<'pl>, MSP>(
        document_changes: &DC,
        indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP>,
        extractor_allocs: &'extractor mut ThreadLocal<FullySend<Bump>>,
        step: IndexingStep,
    ) -> Result<Vec<BalancedCaches<'extractor>>>
    where
        MSP: Fn() -> bool + Sync,
    {
        Self::run_extraction(document_changes, indexing_context, extractor_allocs, step)
    }
    selection
}

@@ -3,9 +3,10 @@ use std::collections::HashMap;
use charabia::{SeparatorKind, Token, TokenKind, Tokenizer, TokenizerBuilder};
use serde_json::Value;

use crate::attribute_patterns::PatternMatch;
use crate::update::new::document::Document;
use crate::update::new::extract::perm_json_p::{
    seek_leaf_values_in_array, seek_leaf_values_in_object, select_field, Depth, Selection,
    seek_leaf_values_in_array, seek_leaf_values_in_object, Depth,
};
use crate::{
    FieldId, GlobalFieldsIdsMap, InternalError, LocalizedAttributesRule, Result, UserError,
@@ -17,8 +18,6 @@ const MAX_DISTANCE: u32 = 8;

pub struct DocumentTokenizer<'a> {
    pub tokenizer: &'a Tokenizer<'a>,
    pub attribute_to_extract: Option<&'a [&'a str]>,
    pub attribute_to_skip: &'a [&'a str],
    pub localized_attributes_rules: &'a [LocalizedAttributesRule],
    pub max_positions_per_attributes: u32,
}
@@ -31,87 +30,94 @@ impl<'a> DocumentTokenizer<'a> {
        token_fn: &mut impl FnMut(&str, FieldId, u16, &str) -> Result<()>,
    ) -> Result<()> {
        let mut field_position = HashMap::new();
        let mut tokenize_field = |field_name: &str, _depth, value: &Value| {
            let Some((field_id, meta)) = field_id_map.id_with_metadata_or_insert(field_name) else {
                return Err(UserError::AttributeLimitReached.into());
            };

            if meta.is_searchable() {
                self.tokenize_field(field_id, field_name, value, token_fn, &mut field_position)?;
            }

            // todo: should be a match on the field_name using `match_field_legacy` function,
            // but for legacy reasons we iterate over all the fields to fill the field_id_map.
            Ok(PatternMatch::Match)
        };

        for entry in document.iter_top_level_fields() {
            let (field_name, value) = entry?;

            let mut tokenize_field = |field_name: &str, _depth, value: &Value| {
                let Some(field_id) = field_id_map.id_or_insert(field_name) else {
                    return Err(UserError::AttributeLimitReached.into());
                };

                if select_field(field_name, self.attribute_to_extract, self.attribute_to_skip)
                    != Selection::Select
                {
                    return Ok(());
                }

                let position = field_position
                    .entry(field_id)
                    .and_modify(|counter| *counter += MAX_DISTANCE)
                    .or_insert(0);
                if *position >= self.max_positions_per_attributes {
                    return Ok(());
                }

                let text;
                let tokens = match value {
                    Value::Number(n) => {
                        text = n.to_string();
                        self.tokenizer.tokenize(text.as_str())
                    }
                    Value::Bool(b) => {
                        text = b.to_string();
                        self.tokenizer.tokenize(text.as_str())
                    }
                    Value::String(text) => {
                        let locales = self
                            .localized_attributes_rules
                            .iter()
                            .find(|rule| rule.match_str(field_name))
                            .map(|rule| rule.locales());
                        self.tokenizer.tokenize_with_allow_list(text.as_str(), locales)
                    }
                    _ => return Ok(()),
                };

                // create an iterator of token with their positions.
                let tokens = process_tokens(*position, tokens)
                    .take_while(|(p, _)| *p < self.max_positions_per_attributes);

                for (index, token) in tokens {
                    // keep a word only if it is not empty and fit in a LMDB key.
                    let token = token.lemma().trim();
                    if !token.is_empty() && token.len() <= MAX_WORD_LENGTH {
                        *position = index;
                        if let Ok(position) = (*position).try_into() {
                            token_fn(field_name, field_id, position, token)?;
                        }
                    }
                }

                Ok(())
            };

            // parse json.
            match serde_json::to_value(value).map_err(InternalError::SerdeJson)? {
                Value::Object(object) => seek_leaf_values_in_object(
                    &object,
                    None,
                    &[],
                    field_name,
                    Depth::OnBaseKey,
                    &mut tokenize_field,
                )?,
                Value::Array(array) => seek_leaf_values_in_array(
                    &array,
                    None,
                    &[],
                    field_name,
                    Depth::OnBaseKey,
                    &mut tokenize_field,
                )?,
                value => tokenize_field(field_name, Depth::OnBaseKey, &value)?,
                value => {
                    tokenize_field(field_name, Depth::OnBaseKey, &value)?;
                }
            }
        }

        Ok(())
    }

    fn tokenize_field(
        &self,
        field_id: FieldId,
        field_name: &str,
        value: &Value,
        token_fn: &mut impl FnMut(&str, u16, u16, &str) -> std::result::Result<(), crate::Error>,
        field_position: &mut HashMap<u16, u32>,
    ) -> Result<()> {
        let position = field_position
            .entry(field_id)
            .and_modify(|counter| *counter += MAX_DISTANCE)
            .or_insert(0);
        if *position >= self.max_positions_per_attributes {
            return Ok(());
        }

        let text;
        let tokens = match value {
            Value::Number(n) => {
                text = n.to_string();
                self.tokenizer.tokenize(text.as_str())
            }
            Value::Bool(b) => {
                text = b.to_string();
                self.tokenizer.tokenize(text.as_str())
            }
            Value::String(text) => {
                let locales = self
                    .localized_attributes_rules
                    .iter()
                    .find(|rule| rule.match_str(field_name) == PatternMatch::Match)
                    .map(|rule| rule.locales());
                self.tokenizer.tokenize_with_allow_list(text.as_str(), locales)
            }
            _ => return Ok(()),
        };

        // create an iterator of token with their positions.
        let tokens = process_tokens(*position, tokens)
            .take_while(|(p, _)| *p < self.max_positions_per_attributes);

        for (index, token) in tokens {
            // keep a word only if it is not empty and fit in a LMDB key.
            let token = token.lemma().trim();
            if !token.is_empty() && token.len() <= MAX_WORD_LENGTH {
                *position = index;
                if let Ok(position) = (*position).try_into() {
                    token_fn(field_name, field_id, position, token)?;
                }
            }
        }

@@ -215,15 +221,20 @@ mod test {
        let mut tb = TokenizerBuilder::default();
        let document_tokenizer = DocumentTokenizer {
            tokenizer: &tb.build(),
            attribute_to_extract: None,
            attribute_to_skip: &["not-me", "me-nether.nope"],
            localized_attributes_rules: &[],
            max_positions_per_attributes: 1000,
        };

        let fields_ids_map = FieldIdMapWithMetadata::new(
            fields_ids_map,
            MetadataBuilder::new(Default::default(), Default::default(), Default::default(), None),
            MetadataBuilder::new(
                Default::default(),
                Default::default(),
                Default::default(),
                None,
                None,
                Default::default(),
            ),
        );

        let fields_ids_map_lock = std::sync::RwLock::new(fields_ids_map);
@@ -265,6 +276,10 @@ mod test {
                2,
                16,
            ]: "catto",
            [
                3,
                0,
            ]: "unsearchable",
            [
                5,
                0,
@@ -277,6 +292,10 @@ mod test {
                8,
                0,
            ]: "23",
            [
                9,
                0,
            ]: "unsearchable",
        }
        "###);
    }

@@ -199,7 +199,7 @@ where
            let span = tracing::trace_span!(target: "indexing::documents::extract", "word_pair_proximity_docids");
            let _entered = span.enter();

            <WordPairProximityDocidsExtractor as DocidsExtractor>::run_extraction(
            WordPairProximityDocidsExtractor::run_extraction(
                document_changes,
                indexing_context,
                extractor_allocs,