mirror of
				https://github.com/meilisearch/meilisearch.git
				synced 2025-10-24 20:46:27 +00:00 
			
		
		
		
	Integrate first searchable exctrator
This commit is contained in:
		| @@ -1,10 +1,12 @@ | |||||||
| use std::fs::File; | use std::fs::File; | ||||||
|  |  | ||||||
| use crossbeam_channel::{IntoIter, Receiver, SendError, Sender}; | use crossbeam_channel::{IntoIter, Receiver, SendError, Sender}; | ||||||
|  | use grenad::Merger; | ||||||
| use heed::types::Bytes; | use heed::types::Bytes; | ||||||
|  |  | ||||||
| use super::StdResult; | use super::StdResult; | ||||||
| use crate::update::new::KvReaderFieldId; | use crate::update::new::KvReaderFieldId; | ||||||
|  | use crate::update::MergeDeladdCboRoaringBitmaps; | ||||||
| use crate::{DocumentId, Index}; | use crate::{DocumentId, Index}; | ||||||
|  |  | ||||||
| /// The capacity of the channel is currently in number of messages. | /// The capacity of the channel is currently in number of messages. | ||||||
| @@ -159,7 +161,7 @@ impl DocumentSender { | |||||||
| } | } | ||||||
|  |  | ||||||
| pub enum MergerOperation { | pub enum MergerOperation { | ||||||
|     WordDocidsCursors(Vec<grenad::ReaderCursor<File>>), |     WordDocidsMerger(Merger<File, MergeDeladdCboRoaringBitmaps>), | ||||||
| } | } | ||||||
|  |  | ||||||
| pub struct MergerReceiver(Receiver<MergerOperation>); | pub struct MergerReceiver(Receiver<MergerOperation>); | ||||||
| @@ -175,3 +177,16 @@ impl IntoIterator for MergerReceiver { | |||||||
|  |  | ||||||
| #[derive(Clone)] | #[derive(Clone)] | ||||||
| pub struct DeladdCboRoaringBitmapSender(Sender<MergerOperation>); | pub struct DeladdCboRoaringBitmapSender(Sender<MergerOperation>); | ||||||
|  |  | ||||||
|  | impl DeladdCboRoaringBitmapSender { | ||||||
|  |     pub fn word_docids( | ||||||
|  |         &self, | ||||||
|  |         merger: Merger<File, MergeDeladdCboRoaringBitmaps>, | ||||||
|  |     ) -> StdResult<(), SendError<()>> { | ||||||
|  |         let operation = MergerOperation::WordDocidsMerger(merger); | ||||||
|  |         match self.0.send(operation) { | ||||||
|  |             Ok(()) => Ok(()), | ||||||
|  |             Err(SendError(_)) => Err(SendError(())), | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  | } | ||||||
|   | |||||||
| @@ -2,7 +2,7 @@ use heed::RoTxn; | |||||||
| use obkv::KvReader; | use obkv::KvReader; | ||||||
|  |  | ||||||
| use crate::update::new::KvReaderFieldId; | use crate::update::new::KvReaderFieldId; | ||||||
| use crate::{DocumentId, FieldId}; | use crate::{DocumentId, FieldId, Index}; | ||||||
|  |  | ||||||
| pub enum DocumentChange { | pub enum DocumentChange { | ||||||
|     Deletion(Deletion), |     Deletion(Deletion), | ||||||
| @@ -12,14 +12,14 @@ pub enum DocumentChange { | |||||||
|  |  | ||||||
| pub struct Deletion { | pub struct Deletion { | ||||||
|     docid: DocumentId, |     docid: DocumentId, | ||||||
|     external_docid: String, // ? |     external_docid: String,        // ? | ||||||
|     current: Box<KvReaderFieldId>, |     current: Box<KvReaderFieldId>, // ? | ||||||
| } | } | ||||||
|  |  | ||||||
| pub struct Update { | pub struct Update { | ||||||
|     docid: DocumentId, |     docid: DocumentId, | ||||||
|     external_docid: String, // ? |     external_docid: String,        // ? | ||||||
|     current: Box<KvReaderFieldId>, |     current: Box<KvReaderFieldId>, // ? | ||||||
|     new: Box<KvReaderFieldId>, |     new: Box<KvReaderFieldId>, | ||||||
| } | } | ||||||
|  |  | ||||||
| @@ -30,7 +30,7 @@ pub struct Insertion { | |||||||
| } | } | ||||||
|  |  | ||||||
| impl DocumentChange { | impl DocumentChange { | ||||||
|     fn docid(&self) -> DocumentId { |     pub fn docid(&self) -> DocumentId { | ||||||
|         match &self { |         match &self { | ||||||
|             Self::Deletion(inner) => inner.docid(), |             Self::Deletion(inner) => inner.docid(), | ||||||
|             Self::Update(inner) => inner.docid(), |             Self::Update(inner) => inner.docid(), | ||||||
| @@ -48,11 +48,11 @@ impl Deletion { | |||||||
|         Self { docid, external_docid, current } |         Self { docid, external_docid, current } | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     fn docid(&self) -> DocumentId { |     pub fn docid(&self) -> DocumentId { | ||||||
|         self.docid |         self.docid | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     fn current(&self, rtxn: &RoTxn) -> &KvReader<FieldId> { |     pub fn current(&self, rtxn: &RoTxn, index: &Index) -> &KvReader<FieldId> { | ||||||
|         unimplemented!() |         unimplemented!() | ||||||
|     } |     } | ||||||
| } | } | ||||||
| @@ -62,11 +62,11 @@ impl Insertion { | |||||||
|         Insertion { docid, external_docid, new } |         Insertion { docid, external_docid, new } | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     fn docid(&self) -> DocumentId { |     pub fn docid(&self) -> DocumentId { | ||||||
|         self.docid |         self.docid | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     fn new(&self) -> &KvReader<FieldId> { |     pub fn new(&self) -> &KvReader<FieldId> { | ||||||
|         unimplemented!() |         unimplemented!() | ||||||
|     } |     } | ||||||
| } | } | ||||||
| @@ -81,15 +81,15 @@ impl Update { | |||||||
|         Update { docid, external_docid, current, new } |         Update { docid, external_docid, current, new } | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     fn docid(&self) -> DocumentId { |     pub fn docid(&self) -> DocumentId { | ||||||
|         self.docid |         self.docid | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     fn current(&self, rtxn: &RoTxn) -> &KvReader<FieldId> { |     pub fn current(&self, rtxn: &RoTxn, index: &Index) -> &KvReader<FieldId> { | ||||||
|         unimplemented!() |         unimplemented!() | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     fn new(&self) -> &KvReader<FieldId> { |     pub fn new(&self) -> &KvReader<FieldId> { | ||||||
|         unimplemented!() |         unimplemented!() | ||||||
|     } |     } | ||||||
| } | } | ||||||
|   | |||||||
| @@ -2,12 +2,12 @@ use std::borrow::Cow; | |||||||
| use std::num::NonZeroUsize; | use std::num::NonZeroUsize; | ||||||
| use std::{io, mem}; | use std::{io, mem}; | ||||||
|  |  | ||||||
| use grenad2::{MergeFunction, Sorter}; | use grenad::{MergeFunction, Sorter}; | ||||||
| use lru::LruCache; | use lru::LruCache; | ||||||
| use roaring::RoaringBitmap; | use roaring::RoaringBitmap; | ||||||
| use smallvec::SmallVec; | use smallvec::SmallVec; | ||||||
|  |  | ||||||
| use crate::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd}; | use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd}; | ||||||
|  |  | ||||||
| #[derive(Debug)] | #[derive(Debug)] | ||||||
| pub struct CachedSorter<MF> { | pub struct CachedSorter<MF> { | ||||||
|   | |||||||
| @@ -1,84 +1,180 @@ | |||||||
| pub fn extract_word_docids( | use std::fs::File; | ||||||
|     document_change: DocumentChange, |  | ||||||
|     _tokenizer: &Tokenizer, | use charabia::TokenizerBuilder; | ||||||
|     output: &mut CachedSorter<DelAddRoaringBitmapMerger>, | use grenad::Merger; | ||||||
| ) -> grenad::Result<(), io::Error> { | use grenad::ReaderCursor; | ||||||
|     match document_change { | use heed::RoTxn; | ||||||
|         DocumentChange::Deletion(inner) => { | use rayon::iter::IntoParallelIterator; | ||||||
|             unimplemented!() | use rayon::iter::ParallelBridge; | ||||||
|         } | use rayon::iter::ParallelIterator; | ||||||
|         DocumentChange::Update(inner) => { |  | ||||||
|             unimplemented!() | use crate::update::MergeDeladdCboRoaringBitmaps; | ||||||
|         } | use crate::{ | ||||||
|         DocumentChange::Insertion(inner) => { |     update::{ | ||||||
|             unimplemented!() |         create_sorter, | ||||||
|  |         new::{DocumentChange, ItemsPool}, | ||||||
|  |         GrenadParameters, | ||||||
|  |     }, | ||||||
|  |     FieldsIdsMap, Index, Result, MAX_POSITION_PER_ATTRIBUTE, | ||||||
|  | }; | ||||||
|  |  | ||||||
|  | use super::{ | ||||||
|  |     cache::{CachedSorter, DelAddRoaringBitmapMerger}, | ||||||
|  |     tokenize_document::DocumentTokenizer, | ||||||
|  | }; | ||||||
|  |  | ||||||
|  | pub trait SearchableExtractor { | ||||||
|  |     fn run_extraction( | ||||||
|  |         index: &Index, | ||||||
|  |         fields_ids_map: &FieldsIdsMap, | ||||||
|  |         indexer: GrenadParameters, | ||||||
|  |         document_changes: impl IntoParallelIterator<Item = Result<DocumentChange>>, | ||||||
|  |     ) -> Result<Merger<File, MergeDeladdCboRoaringBitmaps>> { | ||||||
|  |         let max_memory = indexer.max_memory_by_thread(); | ||||||
|  |  | ||||||
|  |         let rtxn = index.read_txn()?; | ||||||
|  |         let stop_words = index.stop_words(&rtxn)?; | ||||||
|  |         let allowed_separators = index.allowed_separators(&rtxn)?; | ||||||
|  |         let allowed_separators: Option<Vec<_>> = | ||||||
|  |             allowed_separators.as_ref().map(|s| s.iter().map(String::as_str).collect()); | ||||||
|  |         let dictionary = index.dictionary(&rtxn)?; | ||||||
|  |         let dictionary: Option<Vec<_>> = | ||||||
|  |             dictionary.as_ref().map(|s| s.iter().map(String::as_str).collect()); | ||||||
|  |         let builder = tokenizer_builder( | ||||||
|  |             stop_words.as_ref(), | ||||||
|  |             allowed_separators.as_deref(), | ||||||
|  |             dictionary.as_deref(), | ||||||
|  |         ); | ||||||
|  |         let tokenizer = builder.into_tokenizer(); | ||||||
|  |  | ||||||
|  |         let user_defined_searchable_fields = index.user_defined_searchable_fields(&rtxn)?; | ||||||
|  |         let localized_attributes_rules = | ||||||
|  |             index.localized_attributes_rules(&rtxn)?.unwrap_or_default(); | ||||||
|  |  | ||||||
|  |         let document_tokenizer = DocumentTokenizer { | ||||||
|  |             tokenizer: &tokenizer, | ||||||
|  |             searchable_attributes: user_defined_searchable_fields.as_deref(), | ||||||
|  |             localized_attributes_rules: &localized_attributes_rules, | ||||||
|  |             max_positions_per_attributes: MAX_POSITION_PER_ATTRIBUTE, | ||||||
|  |         }; | ||||||
|  |  | ||||||
|  |         let context_pool = ItemsPool::new(|| { | ||||||
|  |             Ok(( | ||||||
|  |                 index.read_txn()?, | ||||||
|  |                 &document_tokenizer, | ||||||
|  |                 CachedSorter::new( | ||||||
|  |                     // TODO use a better value | ||||||
|  |                     100.try_into().unwrap(), | ||||||
|  |                     create_sorter( | ||||||
|  |                         grenad::SortAlgorithm::Stable, | ||||||
|  |                         DelAddRoaringBitmapMerger, | ||||||
|  |                         indexer.chunk_compression_type, | ||||||
|  |                         indexer.chunk_compression_level, | ||||||
|  |                         indexer.max_nb_chunks, | ||||||
|  |                         max_memory, | ||||||
|  |                     ), | ||||||
|  |                 ), | ||||||
|  |             )) | ||||||
|  |         }); | ||||||
|  |  | ||||||
|  |         document_changes.into_par_iter().try_for_each(|document_change| { | ||||||
|  |             context_pool.with(|(rtxn, document_tokenizer, cached_sorter)| { | ||||||
|  |                 Self::extract_document_change( | ||||||
|  |                     &*rtxn, | ||||||
|  |                     index, | ||||||
|  |                     document_tokenizer, | ||||||
|  |                     &fields_ids_map, | ||||||
|  |                     cached_sorter, | ||||||
|  |                     document_change?, | ||||||
|  |                 ) | ||||||
|  |             }) | ||||||
|  |         })?; | ||||||
|  |  | ||||||
|  |         let mut builder = grenad::MergerBuilder::new(MergeDeladdCboRoaringBitmaps); | ||||||
|  |         for (_rtxn, _tokenizer, cache) in context_pool.into_items() { | ||||||
|  |             let sorter = cache.into_sorter()?; | ||||||
|  |             let readers = sorter.into_reader_cursors()?; | ||||||
|  |             builder.extend(readers); | ||||||
|         } |         } | ||||||
|  |  | ||||||
|  |         Ok(builder.build()) | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     let normalizer_options = NormalizerOption::default(); |     fn extract_document_change( | ||||||
|  |         rtxn: &RoTxn, | ||||||
|     if let Some(previous_doc) = previous_doc { |         index: &Index, | ||||||
|         for (_, v) in previous_doc.iter() { |         document_tokenizer: &DocumentTokenizer, | ||||||
|             // Only manage the direct JSON strings |         fields_ids_map: &FieldsIdsMap, | ||||||
|             // TODO manage the JSON strings correctly (escaped chars) |         cached_sorter: &mut CachedSorter<DelAddRoaringBitmapMerger>, | ||||||
|             if v.first().zip(v.last()) == Some((&b'"', &b'"')) { |         document_change: DocumentChange, | ||||||
|                 let s = std::str::from_utf8(&v[1..v.len() - 1]).unwrap(); |     ) -> Result<()>; | ||||||
|                 // for token in tokenizer.tokenize(s).filter(|t| t.is_word()) { |  | ||||||
|                 //     let key = token.lemma().normalize(&normalizer_options); |  | ||||||
|                 for token in s.split_whitespace() { |  | ||||||
|                     let key = token.normalize(&normalizer_options); |  | ||||||
|                     output.insert_del_u32(key.as_bytes(), docid)?; |  | ||||||
|                 } |  | ||||||
|             } |  | ||||||
|         } |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     for (_, v) in new_doc.iter() { |  | ||||||
|         // Only manage the direct JSON strings |  | ||||||
|         // TODO manage the JSON strings correctly (escaped chars) |  | ||||||
|         if v.first().zip(v.last()) == Some((&b'"', &b'"')) { |  | ||||||
|             let s = std::str::from_utf8(&v[1..v.len() - 1]).unwrap(); |  | ||||||
|             // for token in tokenizer.tokenize(s).filter(|t| t.is_word()) { |  | ||||||
|             //     let key = token.lemma().normalize(&normalizer_options); |  | ||||||
|             for token in s.split_whitespace() { |  | ||||||
|                 let key = token.normalize(&normalizer_options); |  | ||||||
|                 output.insert_add_u32(key.as_bytes(), docid)?; |  | ||||||
|             } |  | ||||||
|         } |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     Ok(()) |  | ||||||
| } | } | ||||||
|  |  | ||||||
| /// take an iterator on tokens and compute their relative position depending on separator kinds | pub struct WordDocidsExtractor; | ||||||
| /// if it's an `Hard` separator we add an additional relative proximity of 8 between words, | impl SearchableExtractor for WordDocidsExtractor { | ||||||
| /// else we keep the standard proximity of 1 between words. |     fn extract_document_change( | ||||||
| fn process_tokens<'a>( |         rtxn: &RoTxn, | ||||||
|     tokens: impl Iterator<Item = Token<'a>>, |         index: &Index, | ||||||
| ) -> impl Iterator<Item = (usize, Token<'a>)> { |         document_tokenizer: &DocumentTokenizer, | ||||||
|     tokens |         fields_ids_map: &FieldsIdsMap, | ||||||
|         .skip_while(|token| token.is_separator()) |         // TODO: DelAddRoaringBitmapMerger should be CBO | ||||||
|         .scan((0, None), |(offset, prev_kind), mut token| { |         cached_sorter: &mut CachedSorter<DelAddRoaringBitmapMerger>, | ||||||
|             match token.kind { |         document_change: DocumentChange, | ||||||
|                 TokenKind::Word | TokenKind::StopWord if !token.lemma().is_empty() => { |     ) -> crate::Result<()> { | ||||||
|                     *offset += match *prev_kind { |         match document_change { | ||||||
|                         Some(TokenKind::Separator(SeparatorKind::Hard)) => 8, |             DocumentChange::Deletion(inner) => { | ||||||
|                         Some(_) => 1, |                 let mut token_fn = |_fid, _pos: u16, word: &str| { | ||||||
|                         None => 0, |                     cached_sorter.insert_del_u32(word.as_bytes(), inner.docid()).unwrap(); | ||||||
|                     }; |                 }; | ||||||
|                     *prev_kind = Some(token.kind) |                 document_tokenizer.tokenize_document( | ||||||
|                 } |                     inner.current(rtxn, index), | ||||||
|                 TokenKind::Separator(SeparatorKind::Hard) => { |                     fields_ids_map, | ||||||
|                     *prev_kind = Some(token.kind); |                     &mut token_fn, | ||||||
|                 } |                 )?; | ||||||
|                 TokenKind::Separator(SeparatorKind::Soft) |  | ||||||
|                     if *prev_kind != Some(TokenKind::Separator(SeparatorKind::Hard)) => |  | ||||||
|                 { |  | ||||||
|                     *prev_kind = Some(token.kind); |  | ||||||
|                 } |  | ||||||
|                 _ => token.kind = TokenKind::Unknown, |  | ||||||
|             } |             } | ||||||
|             Some((*offset, token)) |             DocumentChange::Update(inner) => { | ||||||
|         }) |                 let mut token_fn = |_fid, _pos, word: &str| { | ||||||
|         .filter(|(_, t)| t.is_word()) |                     cached_sorter.insert_del_u32(word.as_bytes(), inner.docid()).unwrap(); | ||||||
|  |                 }; | ||||||
|  |                 document_tokenizer.tokenize_document( | ||||||
|  |                     inner.current(rtxn, index), | ||||||
|  |                     fields_ids_map, | ||||||
|  |                     &mut token_fn, | ||||||
|  |                 )?; | ||||||
|  |  | ||||||
|  |                 let mut token_fn = |_fid, _pos, word: &str| { | ||||||
|  |                     cached_sorter.insert_add_u32(word.as_bytes(), inner.docid()).unwrap(); | ||||||
|  |                 }; | ||||||
|  |                 document_tokenizer.tokenize_document(inner.new(), fields_ids_map, &mut token_fn)?; | ||||||
|  |             } | ||||||
|  |             DocumentChange::Insertion(inner) => { | ||||||
|  |                 let mut token_fn = |_fid, _pos, word: &str| { | ||||||
|  |                     cached_sorter.insert_add_u32(word.as_bytes(), inner.docid()).unwrap(); | ||||||
|  |                 }; | ||||||
|  |                 document_tokenizer.tokenize_document(inner.new(), fields_ids_map, &mut token_fn)?; | ||||||
|  |             } | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         Ok(()) | ||||||
|  |     } | ||||||
|  | } | ||||||
|  |  | ||||||
|  | /// Factorize tokenizer building. | ||||||
|  | fn tokenizer_builder<'a>( | ||||||
|  |     stop_words: Option<&'a fst::Set<&'a [u8]>>, | ||||||
|  |     allowed_separators: Option<&'a [&str]>, | ||||||
|  |     dictionary: Option<&'a [&str]>, | ||||||
|  | ) -> TokenizerBuilder<'a, &'a [u8]> { | ||||||
|  |     let mut tokenizer_builder = TokenizerBuilder::new(); | ||||||
|  |     if let Some(stop_words) = stop_words { | ||||||
|  |         tokenizer_builder.stop_words(stop_words); | ||||||
|  |     } | ||||||
|  |     if let Some(dictionary) = dictionary { | ||||||
|  |         tokenizer_builder.words_dict(dictionary); | ||||||
|  |     } | ||||||
|  |     if let Some(separators) = allowed_separators { | ||||||
|  |         tokenizer_builder.separators(separators); | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     tokenizer_builder | ||||||
| } | } | ||||||
|   | |||||||
| @@ -1,2 +1,6 @@ | |||||||
| mod cache; | mod cache; | ||||||
| mod extract_word_docids; | mod extract_word_docids; | ||||||
|  | mod tokenize_document; | ||||||
|  |  | ||||||
|  | pub use extract_word_docids::SearchableExtractor; | ||||||
|  | pub use extract_word_docids::WordDocidsExtractor; | ||||||
|   | |||||||
| @@ -1,56 +1,71 @@ | |||||||
| pub struct DocumentTokenizer { | use crate::{ | ||||||
|     tokenizer: &Tokenizer, |     update::new::KvReaderFieldId, FieldId, FieldsIdsMap, Index, InternalError, | ||||||
|     searchable_attributes: Option<&[String]>, |     LocalizedAttributesRule, Result, MAX_POSITION_PER_ATTRIBUTE, MAX_WORD_LENGTH, | ||||||
|     localized_attributes_rules: &[LocalizedAttributesRule], | }; | ||||||
|     max_positions_per_attributes: u32, | use charabia::{SeparatorKind, Token, TokenKind, Tokenizer, TokenizerBuilder}; | ||||||
|  | use heed::RoTxn; | ||||||
|  | use serde_json::Value; | ||||||
|  | use std::collections::HashMap; | ||||||
|  |  | ||||||
|  | pub struct DocumentTokenizer<'a> { | ||||||
|  |     pub tokenizer: &'a Tokenizer<'a>, | ||||||
|  |     pub searchable_attributes: Option<&'a [&'a str]>, | ||||||
|  |     pub localized_attributes_rules: &'a [LocalizedAttributesRule], | ||||||
|  |     pub max_positions_per_attributes: u32, | ||||||
| } | } | ||||||
|  |  | ||||||
| impl DocumentTokenizer { | impl<'a> DocumentTokenizer<'a> { | ||||||
|     // pub fn new(tokenizer: &Tokenizer, settings: &InnerIndexSettings) -> Self { |     pub fn tokenize_document( | ||||||
|     //     Self { tokenizer, settings } |         &self, | ||||||
|     // } |         obkv: &KvReaderFieldId, | ||||||
|  |  | ||||||
|     pub fn tokenize_document<'a>( |  | ||||||
|         obkv: &KvReader<'a, FieldId>, |  | ||||||
|         field_id_map: &FieldsIdsMap, |         field_id_map: &FieldsIdsMap, | ||||||
|         token_fn: impl Fn(FieldId, u16, &str), |         token_fn: &mut impl FnMut(FieldId, u16, &str), | ||||||
|     ) { |     ) -> Result<()> { | ||||||
|         let mut field_position = Hashmap::new(); |         let mut field_position = HashMap::new(); | ||||||
|         for (field_id, field_bytes) in obkv { |         for (field_id, field_bytes) in obkv { | ||||||
|             let field_name = field_id_map.name(field_id); |             let Some(field_name) = field_id_map.name(field_id) else { | ||||||
|  |                 unreachable!("field id not found in field id map"); | ||||||
|  |             }; | ||||||
|  |  | ||||||
|  |             let mut tokenize_field = |name: &str, value: &Value| { | ||||||
|  |                 let Some(field_id) = field_id_map.id(name) else { | ||||||
|  |                     unreachable!("field name not found in field id map"); | ||||||
|  |                 }; | ||||||
|  |  | ||||||
|  |                 let position = | ||||||
|  |                     field_position.entry(field_id).and_modify(|counter| *counter += 8).or_insert(0); | ||||||
|  |                 if *position as u32 >= self.max_positions_per_attributes { | ||||||
|  |                     return; | ||||||
|  |                 } | ||||||
|  |  | ||||||
|             let tokenize_field = |name, value| { |  | ||||||
|                 let field_id = field_id_map.id(name); |  | ||||||
|                 match value { |                 match value { | ||||||
|                     Number(n) => { |                     Value::Number(n) => { | ||||||
|                         let token = n.to_string(); |                         let token = n.to_string(); | ||||||
|                         let position = field_position |                         if let Ok(position) = (*position).try_into() { | ||||||
|                             .entry(field_id) |                             token_fn(field_id, position, token.as_str()); | ||||||
|                             .and_modify(|counter| *counter += 8) |                         } | ||||||
|                             .or_insert(0); |  | ||||||
|                         token_fn(field_id, position, token.as_str()); |  | ||||||
|                     } |                     } | ||||||
|                     String(text) => { |                     Value::String(text) => { | ||||||
|                         // create an iterator of token with their positions. |                         // create an iterator of token with their positions. | ||||||
|                         let locales = self |                         let locales = self | ||||||
|                             .localized_attributes_rules |                             .localized_attributes_rules | ||||||
|                             .iter() |                             .iter() | ||||||
|                             .first(|rule| rule.match_str(field_name)) |                             .find(|rule| rule.match_str(field_name)) | ||||||
|                             .map(|rule| rule.locales(field_id)); |                             .map(|rule| rule.locales()); | ||||||
|                         let tokens = |                         let tokens = process_tokens( | ||||||
|                             process_tokens(tokenizer.tokenize_with_allow_list(field, locales)) |                             *position, | ||||||
|                                 .take_while(|(p, _)| { |                             self.tokenizer.tokenize_with_allow_list(text.as_str(), locales), | ||||||
|                                     (*p as u32) < self.max_positions_per_attributes |                         ) | ||||||
|                                 }); |                         .take_while(|(p, _)| (*p as u32) < self.max_positions_per_attributes); | ||||||
|  |  | ||||||
|                         for (index, token) in tokens { |                         for (index, token) in tokens { | ||||||
|                             // keep a word only if it is not empty and fit in a LMDB key. |                             // keep a word only if it is not empty and fit in a LMDB key. | ||||||
|                             let token = token.lemma().trim(); |                             let token = token.lemma().trim(); | ||||||
|                             if !token.is_empty() && token.len() <= MAX_WORD_LENGTH { |                             if !token.is_empty() && token.len() <= MAX_WORD_LENGTH { | ||||||
|                                 let position: u16 = index |                                 *position = index; | ||||||
|                                     .try_into() |                                 if let Ok(position) = (*position).try_into() { | ||||||
|                                     .map_err(|_| SerializationError::InvalidNumberSerialization)?; |                                     token_fn(field_id, position, token); | ||||||
|                                 writer.insert(position, token.as_bytes())?; |                                 } | ||||||
|                             } |                             } | ||||||
|                         } |                         } | ||||||
|                     } |                     } | ||||||
| @@ -59,21 +74,28 @@ impl DocumentTokenizer { | |||||||
|             }; |             }; | ||||||
|  |  | ||||||
|             // if the current field is searchable or contains a searchable attribute |             // if the current field is searchable or contains a searchable attribute | ||||||
|             if searchable_attributes.map_or(true, |attributes| { |             if self.searchable_attributes.map_or(true, |attributes| { | ||||||
|                 attributes.iter().any(|name| contained_in(name, field_name)) |                 attributes.iter().any(|name| perm_json_p::contained_in(name, field_name)) | ||||||
|             }) { |             }) { | ||||||
|                 // parse json. |                 // parse json. | ||||||
|                 match serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)? { |                 match serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)? { | ||||||
|                     Value::Object(object) => { |                     Value::Object(object) => perm_json_p::seek_leaf_values_in_object( | ||||||
|                         seek_leaf_values_in_object(object, selectors, &field_name, tokenize_field) |                         &object, | ||||||
|                     } |                         self.searchable_attributes.as_deref(), | ||||||
|                     Value::Array(array) => { |                         &field_name, | ||||||
|                         seek_leaf_values_in_array(array, selectors, &field_name, tokenize_field) |                         &mut tokenize_field, | ||||||
|                     } |                     ), | ||||||
|                     value => tokenize_field(&base_key, value), |                     Value::Array(array) => perm_json_p::seek_leaf_values_in_array( | ||||||
|  |                         &array, | ||||||
|  |                         self.searchable_attributes.as_deref(), | ||||||
|  |                         &field_name, | ||||||
|  |                         &mut tokenize_field, | ||||||
|  |                     ), | ||||||
|  |                     value => tokenize_field(&field_name, &value), | ||||||
|                 } |                 } | ||||||
|             } |             } | ||||||
|         } |         } | ||||||
|  |         Ok(()) | ||||||
|     } |     } | ||||||
| } | } | ||||||
|  |  | ||||||
| @@ -81,11 +103,12 @@ impl DocumentTokenizer { | |||||||
| /// if it's an `Hard` separator we add an additional relative proximity of 8 between words, | /// if it's an `Hard` separator we add an additional relative proximity of 8 between words, | ||||||
| /// else we keep the standard proximity of 1 between words. | /// else we keep the standard proximity of 1 between words. | ||||||
| fn process_tokens<'a>( | fn process_tokens<'a>( | ||||||
|  |     start_offset: usize, | ||||||
|     tokens: impl Iterator<Item = Token<'a>>, |     tokens: impl Iterator<Item = Token<'a>>, | ||||||
| ) -> impl Iterator<Item = (usize, Token<'a>)> { | ) -> impl Iterator<Item = (usize, Token<'a>)> { | ||||||
|     tokens |     tokens | ||||||
|         .skip_while(|token| token.is_separator()) |         .skip_while(|token| token.is_separator()) | ||||||
|         .scan((0, None), |(offset, prev_kind), mut token| { |         .scan((start_offset, None), |(offset, prev_kind), mut token| { | ||||||
|             match token.kind { |             match token.kind { | ||||||
|                 TokenKind::Word | TokenKind::StopWord if !token.lemma().is_empty() => { |                 TokenKind::Word | TokenKind::StopWord if !token.lemma().is_empty() => { | ||||||
|                     *offset += match *prev_kind { |                     *offset += match *prev_kind { | ||||||
| @@ -110,42 +133,45 @@ fn process_tokens<'a>( | |||||||
|         .filter(|(_, t)| t.is_word()) |         .filter(|(_, t)| t.is_word()) | ||||||
| } | } | ||||||
|  |  | ||||||
| /// Returns `true` if the `selector` match the `key`. |  | ||||||
| /// |  | ||||||
| /// ```text |  | ||||||
| /// Example: |  | ||||||
| /// `animaux`           match `animaux` |  | ||||||
| /// `animaux.chien`     match `animaux` |  | ||||||
| /// `animaux.chien`     match `animaux` |  | ||||||
| /// `animaux.chien.nom` match `animaux` |  | ||||||
| /// `animaux.chien.nom` match `animaux.chien` |  | ||||||
| /// ----------------------------------------- |  | ||||||
| /// `animaux`    doesn't match `animaux.chien` |  | ||||||
| /// `animaux.`   doesn't match `animaux` |  | ||||||
| /// `animaux.ch` doesn't match `animaux.chien` |  | ||||||
| /// `animau`     doesn't match `animaux` |  | ||||||
| /// ``` |  | ||||||
| fn contained_in(selector: &str, key: &str) -> bool { |  | ||||||
|     selector.starts_with(key) |  | ||||||
|         && selector[key.len()..].chars().next().map(|c| c == SPLIT_SYMBOL).unwrap_or(true) |  | ||||||
| } |  | ||||||
|  |  | ||||||
| /// TODO move in permissive json pointer | /// TODO move in permissive json pointer | ||||||
| mod perm_json_p { | mod perm_json_p { | ||||||
|  |     use serde_json::{Map, Value}; | ||||||
|  |     const SPLIT_SYMBOL: char = '.'; | ||||||
|  |  | ||||||
|  |     /// Returns `true` if the `selector` match the `key`. | ||||||
|  |     /// | ||||||
|  |     /// ```text | ||||||
|  |     /// Example: | ||||||
|  |     /// `animaux`           match `animaux` | ||||||
|  |     /// `animaux.chien`     match `animaux` | ||||||
|  |     /// `animaux.chien`     match `animaux` | ||||||
|  |     /// `animaux.chien.nom` match `animaux` | ||||||
|  |     /// `animaux.chien.nom` match `animaux.chien` | ||||||
|  |     /// ----------------------------------------- | ||||||
|  |     /// `animaux`    doesn't match `animaux.chien` | ||||||
|  |     /// `animaux.`   doesn't match `animaux` | ||||||
|  |     /// `animaux.ch` doesn't match `animaux.chien` | ||||||
|  |     /// `animau`     doesn't match `animaux` | ||||||
|  |     /// ``` | ||||||
|  |     pub fn contained_in(selector: &str, key: &str) -> bool { | ||||||
|  |         selector.starts_with(key) | ||||||
|  |             && selector[key.len()..].chars().next().map(|c| c == SPLIT_SYMBOL).unwrap_or(true) | ||||||
|  |     } | ||||||
|  |  | ||||||
|     pub fn seek_leaf_values<'a>( |     pub fn seek_leaf_values<'a>( | ||||||
|         value: &Map<String, Value>, |         value: &Map<String, Value>, | ||||||
|         selectors: impl IntoIterator<Item = &'a str>, |         selectors: impl IntoIterator<Item = &'a str>, | ||||||
|         seeker: impl Fn(&str, &Value), |         seeker: &mut impl FnMut(&str, &Value), | ||||||
|     ) { |     ) { | ||||||
|         let selectors: Vec<_> = selectors.into_iter().collect(); |         let selectors: Vec<_> = selectors.into_iter().collect(); | ||||||
|         seek_leaf_values_in_object(value, &selectors, "", &seeker); |         seek_leaf_values_in_object(value, Some(&selectors), "", seeker); | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     pub fn seek_leaf_values_in_object( |     pub fn seek_leaf_values_in_object( | ||||||
|         value: &Map<String, Value>, |         value: &Map<String, Value>, | ||||||
|         selectors: &[&str], |         selectors: Option<&[&str]>, | ||||||
|         base_key: &str, |         base_key: &str, | ||||||
|         seeker: &impl Fn(&str, &Value), |         seeker: &mut impl FnMut(&str, &Value), | ||||||
|     ) { |     ) { | ||||||
|         for (key, value) in value.iter() { |         for (key, value) in value.iter() { | ||||||
|             let base_key = if base_key.is_empty() { |             let base_key = if base_key.is_empty() { | ||||||
| @@ -156,8 +182,10 @@ mod perm_json_p { | |||||||
|  |  | ||||||
|             // here if the user only specified `doggo` we need to iterate in all the fields of `doggo` |             // here if the user only specified `doggo` we need to iterate in all the fields of `doggo` | ||||||
|             // so we check the contained_in on both side |             // so we check the contained_in on both side | ||||||
|             let should_continue = selectors.iter().any(|selector| { |             let should_continue = selectors.map_or(true, |selectors| { | ||||||
|                 contained_in(selector, &base_key) || contained_in(&base_key, selector) |                 selectors.iter().any(|selector| { | ||||||
|  |                     contained_in(selector, &base_key) || contained_in(&base_key, selector) | ||||||
|  |                 }) | ||||||
|             }); |             }); | ||||||
|  |  | ||||||
|             if should_continue { |             if should_continue { | ||||||
| @@ -175,12 +203,12 @@ mod perm_json_p { | |||||||
|     } |     } | ||||||
|  |  | ||||||
|     pub fn seek_leaf_values_in_array( |     pub fn seek_leaf_values_in_array( | ||||||
|         values: &mut [Value], |         values: &[Value], | ||||||
|         selectors: &[&str], |         selectors: Option<&[&str]>, | ||||||
|         base_key: &str, |         base_key: &str, | ||||||
|         seeker: &impl Fn(&str, &Value), |         seeker: &mut impl FnMut(&str, &Value), | ||||||
|     ) { |     ) { | ||||||
|         for value in values.iter_mut() { |         for value in values { | ||||||
|             match value { |             match value { | ||||||
|                 Value::Object(object) => { |                 Value::Object(object) => { | ||||||
|                     seek_leaf_values_in_object(object, selectors, base_key, seeker) |                     seek_leaf_values_in_object(object, selectors, base_key, seeker) | ||||||
| @@ -193,3 +221,91 @@ mod perm_json_p { | |||||||
|         } |         } | ||||||
|     } |     } | ||||||
| } | } | ||||||
|  |  | ||||||
|  | #[cfg(test)] | ||||||
|  | mod test { | ||||||
|  |     use super::*; | ||||||
|  |     use charabia::TokenizerBuilder; | ||||||
|  |     use meili_snap::snapshot; | ||||||
|  |     use obkv::KvReader; | ||||||
|  |     use serde_json::json; | ||||||
|  |     #[test] | ||||||
|  |     fn test_tokenize_document() { | ||||||
|  |         let mut fields_ids_map = FieldsIdsMap::new(); | ||||||
|  |  | ||||||
|  |         let field_1 = json!({ | ||||||
|  |                 "name": "doggo", | ||||||
|  |                 "age": 10, | ||||||
|  |         }); | ||||||
|  |  | ||||||
|  |         let field_2 = json!({ | ||||||
|  |                 "catto": { | ||||||
|  |                     "name": "pesti", | ||||||
|  |                     "age": 23, | ||||||
|  |                 } | ||||||
|  |         }); | ||||||
|  |  | ||||||
|  |         let field_3 = json!(["doggo", "catto"]); | ||||||
|  |  | ||||||
|  |         let mut obkv = obkv::KvWriter::memory(); | ||||||
|  |         let field_1_id = fields_ids_map.insert("doggo").unwrap(); | ||||||
|  |         let field_1 = serde_json::to_string(&field_1).unwrap(); | ||||||
|  |         obkv.insert(field_1_id, field_1.as_bytes()).unwrap(); | ||||||
|  |         let field_2_id = fields_ids_map.insert("catto").unwrap(); | ||||||
|  |         let field_2 = serde_json::to_string(&field_2).unwrap(); | ||||||
|  |         obkv.insert(field_2_id, field_2.as_bytes()).unwrap(); | ||||||
|  |         let field_3_id = fields_ids_map.insert("doggo.name").unwrap(); | ||||||
|  |         let field_3 = serde_json::to_string(&field_3).unwrap(); | ||||||
|  |         obkv.insert(field_3_id, field_3.as_bytes()).unwrap(); | ||||||
|  |         let value = obkv.into_inner().unwrap(); | ||||||
|  |         let obkv = KvReader::from_slice(value.as_slice()); | ||||||
|  |  | ||||||
|  |         fields_ids_map.insert("doggo.age"); | ||||||
|  |         fields_ids_map.insert("catto.catto.name"); | ||||||
|  |         fields_ids_map.insert("catto.catto.age"); | ||||||
|  |  | ||||||
|  |         let mut tb = TokenizerBuilder::default(); | ||||||
|  |         let document_tokenizer = DocumentTokenizer { | ||||||
|  |             tokenizer: &tb.build(), | ||||||
|  |             searchable_attributes: None, | ||||||
|  |             localized_attributes_rules: &[], | ||||||
|  |             max_positions_per_attributes: 1000, | ||||||
|  |         }; | ||||||
|  |  | ||||||
|  |         let mut words = std::collections::BTreeMap::new(); | ||||||
|  |         document_tokenizer | ||||||
|  |             .tokenize_document(obkv, &fields_ids_map, &mut |fid, pos, word| { | ||||||
|  |                 words.insert([fid, pos], word.to_string()); | ||||||
|  |             }) | ||||||
|  |             .unwrap(); | ||||||
|  |  | ||||||
|  |         snapshot!(format!("{:#?}", words), @r###" | ||||||
|  |         { | ||||||
|  |             [ | ||||||
|  |                 2, | ||||||
|  |                 0, | ||||||
|  |             ]: "doggo", | ||||||
|  |             [ | ||||||
|  |                 2, | ||||||
|  |                 8, | ||||||
|  |             ]: "doggo", | ||||||
|  |             [ | ||||||
|  |                 2, | ||||||
|  |                 16, | ||||||
|  |             ]: "catto", | ||||||
|  |             [ | ||||||
|  |                 3, | ||||||
|  |                 0, | ||||||
|  |             ]: "10", | ||||||
|  |             [ | ||||||
|  |                 4, | ||||||
|  |                 0, | ||||||
|  |             ]: "pesti", | ||||||
|  |             [ | ||||||
|  |                 5, | ||||||
|  |                 0, | ||||||
|  |             ]: "23", | ||||||
|  |         } | ||||||
|  |         "###); | ||||||
|  |     } | ||||||
|  | } | ||||||
|   | |||||||
| @@ -15,11 +15,13 @@ use super::channel::{ | |||||||
|     WriterOperation, |     WriterOperation, | ||||||
| }; | }; | ||||||
| use super::document_change::DocumentChange; | use super::document_change::DocumentChange; | ||||||
|  | use super::extract::{SearchableExtractor, WordDocidsExtractor}; | ||||||
| use super::merger::merge_grenad_entries; | use super::merger::merge_grenad_entries; | ||||||
| use super::StdResult; | use super::StdResult; | ||||||
| use crate::documents::{ | use crate::documents::{ | ||||||
|     obkv_to_object, DocumentsBatchCursor, DocumentsBatchIndex, PrimaryKey, DEFAULT_PRIMARY_KEY, |     obkv_to_object, DocumentsBatchCursor, DocumentsBatchIndex, PrimaryKey, DEFAULT_PRIMARY_KEY, | ||||||
| }; | }; | ||||||
|  | use crate::update::GrenadParameters; | ||||||
| use crate::{Index, Result, UserError}; | use crate::{Index, Result, UserError}; | ||||||
|  |  | ||||||
| mod document_deletion; | mod document_deletion; | ||||||
| @@ -45,7 +47,7 @@ pub fn index<PI>( | |||||||
|     wtxn: &mut RwTxn, |     wtxn: &mut RwTxn, | ||||||
|     index: &Index, |     index: &Index, | ||||||
|     pool: &ThreadPool, |     pool: &ThreadPool, | ||||||
|     _document_changes: PI, |     document_changes: PI, | ||||||
| ) -> Result<()> | ) -> Result<()> | ||||||
| where | where | ||||||
|     PI: IntoParallelIterator<Item = Result<DocumentChange>> + Send, |     PI: IntoParallelIterator<Item = Result<DocumentChange>> + Send, | ||||||
| @@ -59,10 +61,18 @@ where | |||||||
|         // TODO manage the errors correctly |         // TODO manage the errors correctly | ||||||
|         let handle = Builder::new().name(S("indexer-extractors")).spawn_scoped(s, || { |         let handle = Builder::new().name(S("indexer-extractors")).spawn_scoped(s, || { | ||||||
|             pool.in_place_scope(|_s| { |             pool.in_place_scope(|_s| { | ||||||
|  |                 let document_changes = document_changes.into_par_iter(); | ||||||
|                 // word docids |                 // word docids | ||||||
|                 // document_changes.into_par_iter().try_for_each(|_dc| Ok(()) as Result<_>) |                 let merger = WordDocidsExtractor::run_extraction( | ||||||
|                 // let grenads = extractor_function(document_changes)?; |                     index, | ||||||
|                 // deladd_cbo_roaring_bitmap_sender.word_docids(grenads)?; |                     todo!(), | ||||||
|  |                     /// TODO: GrenadParameters::default() should be removed in favor a passed parameter | ||||||
|  |                     GrenadParameters::default(), | ||||||
|  |                     document_changes.clone(), | ||||||
|  |                 )?; | ||||||
|  |  | ||||||
|  |                 /// TODO: manage the errors correctly | ||||||
|  |                 deladd_cbo_roaring_bitmap_sender.word_docids(merger).unwrap(); | ||||||
|  |  | ||||||
|                 Ok(()) as Result<_> |                 Ok(()) as Result<_> | ||||||
|             }) |             }) | ||||||
|   | |||||||
| @@ -20,14 +20,12 @@ pub fn merge_grenad_entries( | |||||||
|  |  | ||||||
|     for merger_operation in receiver { |     for merger_operation in receiver { | ||||||
|         match merger_operation { |         match merger_operation { | ||||||
|             MergerOperation::WordDocidsCursors(cursors) => { |             MergerOperation::WordDocidsMerger(merger) => { | ||||||
|                 let sender = sender.word_docids(); |                 let sender = sender.word_docids(); | ||||||
|                 let database = index.word_docids.remap_types::<Bytes, Bytes>(); |                 let database = index.word_docids.remap_types::<Bytes, Bytes>(); | ||||||
|  |  | ||||||
|                 let mut builder = grenad::MergerBuilder::new(MergeDeladdCboRoaringBitmaps); |  | ||||||
|                 builder.extend(cursors); |  | ||||||
|                 /// TODO manage the error correctly |                 /// TODO manage the error correctly | ||||||
|                 let mut merger_iter = builder.build().into_stream_merger_iter().unwrap(); |                 let mut merger_iter = merger.into_stream_merger_iter().unwrap(); | ||||||
|  |  | ||||||
|                 // TODO manage the error correctly |                 // TODO manage the error correctly | ||||||
|                 while let Some((key, deladd)) = merger_iter.next().unwrap() { |                 while let Some((key, deladd)) = merger_iter.next().unwrap() { | ||||||
|   | |||||||
| @@ -4,13 +4,12 @@ pub use items_pool::ItemsPool; | |||||||
| use super::del_add::DelAdd; | use super::del_add::DelAdd; | ||||||
| use crate::FieldId; | use crate::FieldId; | ||||||
|  |  | ||||||
| mod document_change; |  | ||||||
| mod merger; |  | ||||||
| // mod extract; |  | ||||||
| mod channel; | mod channel; | ||||||
| //mod global_fields_ids_map; | mod document_change; | ||||||
|  | mod extract; | ||||||
| pub mod indexer; | pub mod indexer; | ||||||
| mod items_pool; | mod items_pool; | ||||||
|  | mod merger; | ||||||
|  |  | ||||||
| /// TODO move them elsewhere | /// TODO move them elsewhere | ||||||
| pub type StdResult<T, E> = std::result::Result<T, E>; | pub type StdResult<T, E> = std::result::Result<T, E>; | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user