Mirror of https://github.com/meilisearch/meilisearch.git (synced 2025-10-30 23:46:28 +00:00)

	feat: Use the new Tokenizer
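
In short: the commit removes the TokenizerBuilder generic parameter that was
threaded through update_document and the serde serializers, and constructs the
new Tokenizer directly at the point where text is indexed. For callers, the
delta looks like this (a minimal before/after sketch lifted from the tests in
the diff below):

    // Before: callers constructed a tokenizer builder and passed it along.
    let tokenizer_builder = DefaultBuilder::new();
    let docid0 = builder.update_document(&doc0, &tokenizer_builder, &stop_words)?;

    // After: tokenization is an internal detail of the update.
    let docid0 = builder.update_document(&doc0, &stop_words)?;
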
@@ -430,7 +430,6 @@ mod tests {
     use std::error::Error;
 
     use serde_derive::{Serialize, Deserialize};
-    use meilidb_tokenizer::DefaultBuilder;
 
     use crate::database::schema::{SchemaBuilder, STORED, INDEXED};
 
@@ -478,11 +477,10 @@ mod tests {
             timestamp: 7654321,
         };
 
-        let tokenizer_builder = DefaultBuilder::new();
         let mut builder = database.start_update(meilidb_index_name)?;
 
-        let docid0 = builder.update_document(&doc0, &tokenizer_builder, &stop_words)?;
-        let docid1 = builder.update_document(&doc1, &tokenizer_builder, &stop_words)?;
+        let docid0 = builder.update_document(&doc0, &stop_words)?;
+        let docid1 = builder.update_document(&doc1, &stop_words)?;
 
         let view = database.commit_update(builder)?;
 
@@ -549,16 +547,14 @@ mod tests {
             timestamp: 7654321,
         };
 
-        let tokenizer_builder = DefaultBuilder::new();
-
         let mut builder = database.start_update(meilidb_index_name)?;
-        let docid0 = builder.update_document(&doc0, &tokenizer_builder, &stop_words)?;
-        let docid1 = builder.update_document(&doc1, &tokenizer_builder, &stop_words)?;
+        let docid0 = builder.update_document(&doc0, &stop_words)?;
+        let docid1 = builder.update_document(&doc1, &stop_words)?;
         database.commit_update(builder)?;
 
         let mut builder = database.start_update(meilidb_index_name)?;
-        let docid2 = builder.update_document(&doc2, &tokenizer_builder, &stop_words)?;
-        let docid3 = builder.update_document(&doc3, &tokenizer_builder, &stop_words)?;
+        let docid2 = builder.update_document(&doc2, &stop_words)?;
+        let docid3 = builder.update_document(&doc3, &stop_words)?;
         let view = database.commit_update(builder)?;
 
         let de_doc0: SimpleDoc = view.document_by_id(docid0)?;
@@ -640,7 +636,6 @@ mod bench {
             description: String,
         }
 
-        let tokenizer_builder = DefaultBuilder;
         let mut builder = database.start_update(index_name)?;
         let mut rng = XorShiftRng::seed_from_u64(42);
 
@@ -650,7 +645,7 @@ mod bench {
                 title: random_sentences(rng.gen_range(1, 8), &mut rng),
                 description: random_sentences(rng.gen_range(20, 200), &mut rng),
             };
-            builder.update_document(&document, &tokenizer_builder, &stop_words)?;
+            builder.update_document(&document, &stop_words)?;
         }
 
         database.commit_update(builder)?;
@@ -688,7 +683,6 @@ mod bench {
             description: String,
         }
 
-        let tokenizer_builder = DefaultBuilder;
         let mut builder = database.start_update(index_name)?;
         let mut rng = XorShiftRng::seed_from_u64(42);
 
@@ -698,7 +692,7 @@ mod bench {
                 title: random_sentences(rng.gen_range(1, 8), &mut rng),
                 description: random_sentences(rng.gen_range(20, 200), &mut rng),
             };
-            builder.update_document(&document, &tokenizer_builder, &stop_words)?;
+            builder.update_document(&document, &stop_words)?;
         }
 
         database.commit_update(builder)?;
@@ -737,7 +731,6 @@ mod bench {
             description: String,
         }
 
-        let tokenizer_builder = DefaultBuilder;
         let mut builder = database.start_update(index_name)?;
         let mut rng = XorShiftRng::seed_from_u64(42);
 
@@ -747,7 +740,7 @@ mod bench {
                 title: random_sentences(rng.gen_range(1, 8), &mut rng),
                 description: random_sentences(rng.gen_range(20, 200), &mut rng),
             };
-            builder.update_document(&document, &tokenizer_builder, &stop_words)?;
+            builder.update_document(&document, &stop_words)?;
         }
 
         database.commit_update(builder)?;
@@ -785,7 +778,6 @@ mod bench {
             description: String,
        }
 
-        let tokenizer_builder = DefaultBuilder;
         let mut builder = database.start_update(index_name)?;
         let mut rng = XorShiftRng::seed_from_u64(42);
 
@@ -795,7 +787,7 @@ mod bench {
                 title: random_sentences(rng.gen_range(1, 8), &mut rng),
                 description: random_sentences(rng.gen_range(20, 200), &mut rng),
             };
-            builder.update_document(&document, &tokenizer_builder, &stop_words)?;
+            builder.update_document(&document, &stop_words)?;
         }
 
         let view = database.commit_update(builder)?;
@@ -833,7 +825,6 @@ mod bench {
             description: String,
         }
 
-        let tokenizer_builder = DefaultBuilder;
         let mut builder = database.start_update(index_name)?;
         let mut rng = XorShiftRng::seed_from_u64(42);
 
@@ -843,7 +834,7 @@ mod bench {
                 title: random_sentences(rng.gen_range(1, 8), &mut rng),
                 description: random_sentences(rng.gen_range(20, 200), &mut rng),
             };
-            builder.update_document(&document, &tokenizer_builder, &stop_words)?;
+            builder.update_document(&document, &stop_words)?;
         }
 
         let view = database.commit_update(builder)?;
@@ -882,7 +873,6 @@ mod bench {
             description: String,
         }
 
-        let tokenizer_builder = DefaultBuilder;
         let mut builder = database.start_update(index_name)?;
         let mut rng = XorShiftRng::seed_from_u64(42);
 
@@ -892,7 +882,7 @@ mod bench {
                 title: random_sentences(rng.gen_range(1, 8), &mut rng),
                 description: random_sentences(rng.gen_range(20, 200), &mut rng),
             };
-            builder.update_document(&document, &tokenizer_builder, &stop_words)?;
+            builder.update_document(&document, &stop_words)?;
         }
 
         let view = database.commit_update(builder)?;

@@ -3,23 +3,20 @@ use std::collections::HashSet;
 use serde::Serialize;
 use serde::ser;
 use meilidb_core::{DocumentId, DocIndex};
-use meilidb_tokenizer::{TokenizerBuilder, Token, is_cjk};
+use meilidb_tokenizer::{Tokenizer, Token, is_cjk};
 
 use crate::database::update::DocumentUpdate;
 use crate::database::serde::SerializerError;
 use crate::database::schema::SchemaAttr;
 
-pub struct IndexerSerializer<'a, 'b, B> {
-    pub tokenizer_builder: &'a B,
+pub struct IndexerSerializer<'a, 'b> {
     pub update: &'a mut DocumentUpdate<'b>,
     pub document_id: DocumentId,
     pub attribute: SchemaAttr,
     pub stop_words: &'a HashSet<String>,
 }
 
-impl<'a, 'b, B> ser::Serializer for IndexerSerializer<'a, 'b, B>
-where B: TokenizerBuilder
-{
+impl<'a, 'b> ser::Serializer for IndexerSerializer<'a, 'b> {
     type Ok = ();
     type Error = SerializerError;
     type SerializeSeq = ser::Impossible<Self::Ok, Self::Error>;
@@ -49,7 +46,7 @@ where B: TokenizerBuilder
     }
 
     fn serialize_str(self, v: &str) -> Result<Self::Ok, Self::Error> {
-        for token in self.tokenizer_builder.build(v) {
+        for token in Tokenizer::new(v) {
             let Token { word, word_index, char_index } = token;
             let document_id = self.document_id;
 

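The serialize_str hunk above is the heart of the change: instead of calling
build() on an injected builder, the serializer now iterates Tokenizer::new(v)
directly. Below is a standalone sketch of that idea, assuming, as the
destructuring in the diff implies, that Tokenizer is an Iterator yielding
Token values whose word field is a &str; the helper itself is hypothetical
and not part of this commit:

    use std::collections::HashSet;

    use meilidb_tokenizer::Tokenizer;

    // Tokenize a text field and keep the words that are not stop words,
    // mirroring the filtering the indexer performs with its stop_words set.
    fn indexable_words(text: &str, stop_words: &HashSet<String>) -> Vec<String> {
        Tokenizer::new(text)
            .filter(|token| !stop_words.contains(token.word))
            .map(|token| token.word.to_string())
            .collect()
    }
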
@@ -2,7 +2,6 @@ use std::collections::HashSet;
 
 use serde::Serialize;
 use serde::ser;
-use meilidb_tokenizer::TokenizerBuilder;
 
 use crate::database::serde::indexer_serializer::IndexerSerializer;
 use crate::database::serde::key_to_string::KeyToStringSerializer;
@@ -12,25 +11,22 @@ use crate::database::serde::SerializerError;
 use crate::database::schema::Schema;
 use meilidb_core::DocumentId;
 
-pub struct Serializer<'a, 'b, B> {
+pub struct Serializer<'a, 'b> {
     pub schema: &'a Schema,
     pub update: &'a mut DocumentUpdate<'b>,
     pub document_id: DocumentId,
-    pub tokenizer_builder: &'a B,
     pub stop_words: &'a HashSet<String>,
 }
 
-impl<'a, 'b, B> ser::Serializer for Serializer<'a, 'b, B>
-where B: TokenizerBuilder
-{
+impl<'a, 'b> ser::Serializer for Serializer<'a, 'b> {
     type Ok = ();
     type Error = SerializerError;
     type SerializeSeq = ser::Impossible<Self::Ok, Self::Error>;
     type SerializeTuple = ser::Impossible<Self::Ok, Self::Error>;
     type SerializeTupleStruct = ser::Impossible<Self::Ok, Self::Error>;
     type SerializeTupleVariant = ser::Impossible<Self::Ok, Self::Error>;
-    type SerializeMap = MapSerializer<'a, 'b, B>;
-    type SerializeStruct = StructSerializer<'a, 'b, B>;
+    type SerializeMap = MapSerializer<'a, 'b>;
+    type SerializeStruct = StructSerializer<'a, 'b>;
     type SerializeStructVariant = ser::Impossible<Self::Ok, Self::Error>;
 
     forward_to_unserializable_type! {
@@ -142,7 +138,6 @@ where B: TokenizerBuilder
             schema: self.schema,
             document_id: self.document_id,
             update: self.update,
-            tokenizer_builder: self.tokenizer_builder,
             stop_words: self.stop_words,
             current_key_name: None,
         })
@@ -158,7 +153,6 @@ where B: TokenizerBuilder
             schema: self.schema,
             document_id: self.document_id,
             update: self.update,
-            tokenizer_builder: self.tokenizer_builder,
             stop_words: self.stop_words,
         })
     }
@@ -175,18 +169,15 @@ where B: TokenizerBuilder
     }
 }
 
-pub struct MapSerializer<'a, 'b, B> {
+pub struct MapSerializer<'a, 'b> {
     pub schema: &'a Schema,
     pub document_id: DocumentId,
     pub update: &'a mut DocumentUpdate<'b>,
-    pub tokenizer_builder: &'a B,
     pub stop_words: &'a HashSet<String>,
     pub current_key_name: Option<String>,
 }
 
-impl<'a, 'b, B> ser::SerializeMap for MapSerializer<'a, 'b, B>
-where B: TokenizerBuilder
-{
+impl<'a, 'b> ser::SerializeMap for MapSerializer<'a, 'b> {
     type Ok = ();
     type Error = SerializerError;
 
@@ -223,7 +214,6 @@ where B: TokenizerBuilder
             if props.is_indexed() {
                 let serializer = IndexerSerializer {
                     update: self.update,
-                    tokenizer_builder: self.tokenizer_builder,
                     document_id: self.document_id,
                     attribute: attr,
                     stop_words: self.stop_words,
@@ -244,17 +234,14 @@ where B: TokenizerBuilder
     }
 }
 
-pub struct StructSerializer<'a, 'b, B> {
+pub struct StructSerializer<'a, 'b> {
     pub schema: &'a Schema,
     pub document_id: DocumentId,
     pub update: &'a mut DocumentUpdate<'b>,
-    pub tokenizer_builder: &'a B,
     pub stop_words: &'a HashSet<String>,
 }
 
-impl<'a, 'b, B> ser::SerializeStruct for StructSerializer<'a, 'b, B>
-where B: TokenizerBuilder
-{
+impl<'a, 'b> ser::SerializeStruct for StructSerializer<'a, 'b> {
     type Ok = ();
     type Error = SerializerError;
 
@@ -274,7 +261,6 @@ where B: TokenizerBuilder
             if props.is_indexed() {
                 let serializer = IndexerSerializer {
                     update: self.update,
-                    tokenizer_builder: self.tokenizer_builder,
                     document_id: self.document_id,
                     attribute: attr,
                     stop_words: self.stop_words,

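For reference, the post-change shape of the outer serializer, reconstructed
from the hunks above (MapSerializer and StructSerializer lose the same field
and generic parameter):

    pub struct Serializer<'a, 'b> {
        pub schema: &'a Schema,
        pub update: &'a mut DocumentUpdate<'b>,
        pub document_id: DocumentId,
        pub stop_words: &'a HashSet<String>,
    }
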
@@ -8,7 +8,6 @@ use serde::Serialize;
 use meilidb_core::write_to_bytes::WriteToBytes;
 use meilidb_core::data::DocIds;
 use meilidb_core::{IndexBuilder, DocumentId, DocIndex};
-use meilidb_tokenizer::TokenizerBuilder;
 
 use crate::database::document_key::{DocumentKey, DocumentKeyAttr};
 use crate::database::serde::serializer::Serializer;
@@ -36,21 +35,18 @@ impl Update {
         Update { schema, raw_builder: RawUpdateBuilder::new() }
     }
 
-    pub fn update_document<T, B>(
+    pub fn update_document<T>(
         &mut self,
         document: T,
-        tokenizer_builder: &B,
         stop_words: &HashSet<String>,
     ) -> Result<DocumentId, SerializerError>
     where T: Serialize,
-          B: TokenizerBuilder,
     {
         let document_id = self.schema.document_id(&document)?;
 
         let serializer = Serializer {
             schema: &self.schema,
             document_id: document_id,
-            tokenizer_builder: tokenizer_builder,
             update: &mut self.raw_builder.document_update(document_id)?,
             stop_words: stop_words,
         };

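Putting the last two hunks together, the public entry point now reads as
follows (reconstructed from the diff; the code after the Serializer is built
is elided):

    pub fn update_document<T>(
        &mut self,
        document: T,
        stop_words: &HashSet<String>,
    ) -> Result<DocumentId, SerializerError>
    where T: Serialize,
    {
        let document_id = self.schema.document_id(&document)?;

        // Serializes the document, tokenizing indexed string fields
        // internally; callers no longer supply a tokenizer builder.
        let serializer = Serializer {
            schema: &self.schema,
            document_id: document_id,
            update: &mut self.raw_builder.document_update(document_id)?,
            stop_words: stop_words,
        };

        // ...
    }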