mirror of
				https://github.com/meilisearch/meilisearch.git
				synced 2025-10-25 04:56:28 +00:00 
			
		
		
		
	Merge #540
540: Integrate charabia r=Kerollmops a=ManyTheFish related to https://github.com/meilisearch/meilisearch/issues/2375 related to https://github.com/meilisearch/meilisearch/issues/2144 related to https://github.com/meilisearch/meilisearch/issues/2417 Co-authored-by: ManyTheFish <many@meilisearch.com>
This commit is contained in:
		| @@ -1,5 +1,5 @@ | |||||||
| use criterion::{criterion_group, criterion_main}; | use criterion::{criterion_group, criterion_main}; | ||||||
| use milli::tokenizer::{Analyzer, AnalyzerConfig}; | use milli::tokenizer::Tokenize; | ||||||
| use milli::{FormatOptions, MatcherBuilder, MatchingWord, MatchingWords}; | use milli::{FormatOptions, MatcherBuilder, MatchingWord, MatchingWords}; | ||||||
|  |  | ||||||
| #[cfg(target_os = "linux")] | #[cfg(target_os = "linux")] | ||||||
| @@ -52,9 +52,7 @@ fn bench_formatting(c: &mut criterion::Criterion) { | |||||||
|         for conf in confs { |         for conf in confs { | ||||||
|             group.bench_function(conf.name, |b| { |             group.bench_function(conf.name, |b| { | ||||||
|                 b.iter(|| { |                 b.iter(|| { | ||||||
|                     let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default()); |                     let tokens: Vec<_> = conf.text.tokenize().collect(); | ||||||
|                     let analyzed = analyzer.analyze(&conf.text); |  | ||||||
|                     let tokens: Vec<_> = analyzed.tokens().collect(); |  | ||||||
|                     let mut matcher = conf.matching_words.build(&tokens[..], conf.text); |                     let mut matcher = conf.matching_words.build(&tokens[..], conf.text); | ||||||
|                     matcher.format(option.clone()); |                     matcher.format(option.clone()); | ||||||
|                 }) |                 }) | ||||||
|   | |||||||
| @@ -19,7 +19,7 @@ use flate2::read::GzDecoder; | |||||||
| use futures::{stream, FutureExt, StreamExt}; | use futures::{stream, FutureExt, StreamExt}; | ||||||
| use heed::EnvOpenOptions; | use heed::EnvOpenOptions; | ||||||
| use milli::documents::DocumentBatchReader; | use milli::documents::DocumentBatchReader; | ||||||
| use milli::tokenizer::{Analyzer, AnalyzerConfig}; | use milli::tokenizer::{Tokenizer, TokenizerBuilder}; | ||||||
| use milli::update::UpdateIndexingStep::*; | use milli::update::UpdateIndexingStep::*; | ||||||
| use milli::update::{ | use milli::update::{ | ||||||
|     ClearDocuments, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Setting, |     ClearDocuments, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Setting, | ||||||
| @@ -139,17 +139,16 @@ pub struct IndexerOpt { | |||||||
|     pub max_positions_per_attributes: Option<u32>, |     pub max_positions_per_attributes: Option<u32>, | ||||||
| } | } | ||||||
|  |  | ||||||
| struct Highlighter<'a, A> { | struct Highlighter<'s, A> { | ||||||
|     analyzer: Analyzer<'a, A>, |     tokenizer: Tokenizer<'s, A>, | ||||||
| } | } | ||||||
|  |  | ||||||
| impl<'a, A: AsRef<[u8]>> Highlighter<'a, A> { | impl<'s, A: AsRef<[u8]>> Highlighter<'s, A> { | ||||||
|     fn new(stop_words: &'a fst::Set<A>) -> Self { |     fn new(stop_words: &'s fst::Set<A>) -> Self { | ||||||
|         let mut config = AnalyzerConfig::default(); |         let mut builder = TokenizerBuilder::new(); | ||||||
|         config.stop_words(stop_words); |         builder.stop_words(stop_words); | ||||||
|         let analyzer = Analyzer::new(config); |  | ||||||
|  |  | ||||||
|         Self { analyzer } |         Self { tokenizer: builder.build() } | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     fn highlight_value(&self, value: Value, matcher_builder: &MatcherBuilder) -> Value { |     fn highlight_value(&self, value: Value, matcher_builder: &MatcherBuilder) -> Value { | ||||||
| @@ -158,9 +157,8 @@ impl<'a, A: AsRef<[u8]>> Highlighter<'a, A> { | |||||||
|             Value::Bool(boolean) => Value::Bool(boolean), |             Value::Bool(boolean) => Value::Bool(boolean), | ||||||
|             Value::Number(number) => Value::Number(number), |             Value::Number(number) => Value::Number(number), | ||||||
|             Value::String(old_string) => { |             Value::String(old_string) => { | ||||||
|                 let analyzed = self.analyzer.analyze(&old_string); |                 let tokens: Vec<_> = self.tokenizer.tokenize(&old_string).collect(); | ||||||
|                 let analyzed: Vec<_> = analyzed.tokens().collect(); |                 let mut matcher = matcher_builder.build(&tokens[..], &old_string); | ||||||
|                 let mut matcher = matcher_builder.build(&analyzed[..], &old_string); |  | ||||||
|  |  | ||||||
|                 let format_options = FormatOptions { highlight: true, crop: Some(10) }; |                 let format_options = FormatOptions { highlight: true, crop: Some(10) }; | ||||||
|  |  | ||||||
|   | |||||||
| @@ -9,18 +9,18 @@ bimap = { version = "0.6.2", features = ["serde"] } | |||||||
| bincode = "1.3.3" | bincode = "1.3.3" | ||||||
| bstr = "0.2.17" | bstr = "0.2.17" | ||||||
| byteorder = "1.4.3" | byteorder = "1.4.3" | ||||||
|  | charabia = "0.5.0" | ||||||
| concat-arrays = "0.1.2" | concat-arrays = "0.1.2" | ||||||
| crossbeam-channel = "0.5.2" | crossbeam-channel = "0.5.2" | ||||||
| either = "1.6.1" | either = "1.6.1" | ||||||
|  | flatten-serde-json = { path = "../flatten-serde-json" } | ||||||
| fst = "0.4.7" | fst = "0.4.7" | ||||||
| fxhash = "0.2.1" | fxhash = "0.2.1" | ||||||
| flatten-serde-json = { path = "../flatten-serde-json" } |  | ||||||
| grenad = { version = "0.4.1", default-features = false, features = ["tempfile"] } |  | ||||||
| geoutils = "0.4.1" | geoutils = "0.4.1" | ||||||
|  | grenad = { version = "0.4.1", default-features = false, features = ["tempfile"] } | ||||||
| heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.1", default-features = false, features = ["lmdb", "sync-read-txn"] } | heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.1", default-features = false, features = ["lmdb", "sync-read-txn"] } | ||||||
| json-depth-checker = { path = "../json-depth-checker" } | json-depth-checker = { path = "../json-depth-checker" } | ||||||
| levenshtein_automata = { version = "0.2.1", features = ["fst_automaton"] } | levenshtein_automata = { version = "0.2.1", features = ["fst_automaton"] } | ||||||
| meilisearch-tokenizer = { git = "https://github.com/meilisearch/tokenizer.git", tag = "v0.2.9" } |  | ||||||
| memmap2 = "0.5.3" | memmap2 = "0.5.3" | ||||||
| obkv = "0.2.0" | obkv = "0.2.0" | ||||||
| once_cell = "1.10.0" | once_cell = "1.10.0" | ||||||
|   | |||||||
| @@ -21,7 +21,7 @@ pub use filter_parser::{Condition, FilterCondition}; | |||||||
| use fxhash::{FxHasher32, FxHasher64}; | use fxhash::{FxHasher32, FxHasher64}; | ||||||
| pub use grenad::CompressionType; | pub use grenad::CompressionType; | ||||||
| use serde_json::{Map, Value}; | use serde_json::{Map, Value}; | ||||||
| pub use {heed, meilisearch_tokenizer as tokenizer}; | pub use {charabia as tokenizer, heed}; | ||||||
|  |  | ||||||
| pub use self::asc_desc::{AscDesc, AscDescError, Member, SortError}; | pub use self::asc_desc::{AscDesc, AscDescError, Member, SortError}; | ||||||
| pub use self::criterion::{default_criteria, Criterion, CriterionError}; | pub use self::criterion::{default_criteria, Criterion, CriterionError}; | ||||||
|   | |||||||
| @@ -3,8 +3,8 @@ use std::collections::BTreeMap; | |||||||
| use std::fmt; | use std::fmt; | ||||||
| use std::ops::{Index, IndexMut}; | use std::ops::{Index, IndexMut}; | ||||||
|  |  | ||||||
|  | use charabia::Token; | ||||||
| use levenshtein_automata::{Distance, DFA}; | use levenshtein_automata::{Distance, DFA}; | ||||||
| use meilisearch_tokenizer::Token; |  | ||||||
|  |  | ||||||
| use crate::search::build_dfa; | use crate::search::build_dfa; | ||||||
|  |  | ||||||
| @@ -99,13 +99,13 @@ impl MatchingWord { | |||||||
|  |  | ||||||
|     /// Returns the length in chars of the match if the token matches the term. |     /// Returns the length in chars of the match if the token matches the term. | ||||||
|     pub fn match_token(&self, token: &Token) -> Option<usize> { |     pub fn match_token(&self, token: &Token) -> Option<usize> { | ||||||
|         match self.dfa.eval(token.text()) { |         match self.dfa.eval(token.lemma()) { | ||||||
|             Distance::Exact(t) if t <= self.typo => { |             Distance::Exact(t) if t <= self.typo => { | ||||||
|                 if self.prefix { |                 if self.prefix { | ||||||
|                     let len = bytes_to_highlight(token.text(), &self.word); |                     let len = bytes_to_highlight(token.lemma(), &self.word); | ||||||
|                     Some(token.num_chars_from_bytes(len)) |                     Some(token.original_lengths(len).0) | ||||||
|                 } else { |                 } else { | ||||||
|                     Some(token.num_chars_from_bytes(token.text().len())) |                     Some(token.original_lengths(token.lemma().len()).0) | ||||||
|                 } |                 } | ||||||
|             } |             } | ||||||
|             _otherwise => None, |             _otherwise => None, | ||||||
| @@ -262,7 +262,7 @@ mod tests { | |||||||
|     use std::borrow::Cow; |     use std::borrow::Cow; | ||||||
|     use std::str::from_utf8; |     use std::str::from_utf8; | ||||||
|  |  | ||||||
|     use meilisearch_tokenizer::TokenKind; |     use charabia::TokenKind; | ||||||
|  |  | ||||||
|     use super::*; |     use super::*; | ||||||
|     use crate::MatchingWords; |     use crate::MatchingWords; | ||||||
| @@ -344,11 +344,10 @@ mod tests { | |||||||
|             matching_words |             matching_words | ||||||
|                 .match_token(&Token { |                 .match_token(&Token { | ||||||
|                     kind: TokenKind::Word, |                     kind: TokenKind::Word, | ||||||
|                     word: Cow::Borrowed("word"), |                     lemma: Cow::Borrowed("word"), | ||||||
|                     byte_start: 0, |                     char_end: "word".chars().count(), | ||||||
|                     char_index: 0, |  | ||||||
|                     byte_end: "word".len(), |                     byte_end: "word".len(), | ||||||
|                     char_map: None, |                     ..Default::default() | ||||||
|                 }) |                 }) | ||||||
|                 .next(), |                 .next(), | ||||||
|             Some(MatchType::Full { char_len: 3, ids: &[2] }) |             Some(MatchType::Full { char_len: 3, ids: &[2] }) | ||||||
| @@ -357,11 +356,10 @@ mod tests { | |||||||
|             matching_words |             matching_words | ||||||
|                 .match_token(&Token { |                 .match_token(&Token { | ||||||
|                     kind: TokenKind::Word, |                     kind: TokenKind::Word, | ||||||
|                     word: Cow::Borrowed("nyc"), |                     lemma: Cow::Borrowed("nyc"), | ||||||
|                     byte_start: 0, |                     char_end: "nyc".chars().count(), | ||||||
|                     char_index: 0, |  | ||||||
|                     byte_end: "nyc".len(), |                     byte_end: "nyc".len(), | ||||||
|                     char_map: None, |                     ..Default::default() | ||||||
|                 }) |                 }) | ||||||
|                 .next(), |                 .next(), | ||||||
|             None |             None | ||||||
| @@ -370,11 +368,10 @@ mod tests { | |||||||
|             matching_words |             matching_words | ||||||
|                 .match_token(&Token { |                 .match_token(&Token { | ||||||
|                     kind: TokenKind::Word, |                     kind: TokenKind::Word, | ||||||
|                     word: Cow::Borrowed("world"), |                     lemma: Cow::Borrowed("world"), | ||||||
|                     byte_start: 0, |                     char_end: "world".chars().count(), | ||||||
|                     char_index: 0, |  | ||||||
|                     byte_end: "world".len(), |                     byte_end: "world".len(), | ||||||
|                     char_map: None, |                     ..Default::default() | ||||||
|                 }) |                 }) | ||||||
|                 .next(), |                 .next(), | ||||||
|             Some(MatchType::Full { char_len: 5, ids: &[2] }) |             Some(MatchType::Full { char_len: 5, ids: &[2] }) | ||||||
| @@ -383,11 +380,10 @@ mod tests { | |||||||
|             matching_words |             matching_words | ||||||
|                 .match_token(&Token { |                 .match_token(&Token { | ||||||
|                     kind: TokenKind::Word, |                     kind: TokenKind::Word, | ||||||
|                     word: Cow::Borrowed("splitted"), |                     lemma: Cow::Borrowed("splitted"), | ||||||
|                     byte_start: 0, |                     char_end: "splitted".chars().count(), | ||||||
|                     char_index: 0, |  | ||||||
|                     byte_end: "splitted".len(), |                     byte_end: "splitted".len(), | ||||||
|                     char_map: None, |                     ..Default::default() | ||||||
|                 }) |                 }) | ||||||
|                 .next(), |                 .next(), | ||||||
|             Some(MatchType::Full { char_len: 5, ids: &[0] }) |             Some(MatchType::Full { char_len: 5, ids: &[0] }) | ||||||
| @@ -396,11 +392,10 @@ mod tests { | |||||||
|             matching_words |             matching_words | ||||||
|                 .match_token(&Token { |                 .match_token(&Token { | ||||||
|                     kind: TokenKind::Word, |                     kind: TokenKind::Word, | ||||||
|                     word: Cow::Borrowed("thisnew"), |                     lemma: Cow::Borrowed("thisnew"), | ||||||
|                     byte_start: 0, |                     char_end: "thisnew".chars().count(), | ||||||
|                     char_index: 0, |  | ||||||
|                     byte_end: "thisnew".len(), |                     byte_end: "thisnew".len(), | ||||||
|                     char_map: None, |                     ..Default::default() | ||||||
|                 }) |                 }) | ||||||
|                 .next(), |                 .next(), | ||||||
|             None |             None | ||||||
| @@ -409,11 +404,10 @@ mod tests { | |||||||
|             matching_words |             matching_words | ||||||
|                 .match_token(&Token { |                 .match_token(&Token { | ||||||
|                     kind: TokenKind::Word, |                     kind: TokenKind::Word, | ||||||
|                     word: Cow::Borrowed("borld"), |                     lemma: Cow::Borrowed("borld"), | ||||||
|                     byte_start: 0, |                     char_end: "borld".chars().count(), | ||||||
|                     char_index: 0, |  | ||||||
|                     byte_end: "borld".len(), |                     byte_end: "borld".len(), | ||||||
|                     char_map: None, |                     ..Default::default() | ||||||
|                 }) |                 }) | ||||||
|                 .next(), |                 .next(), | ||||||
|             Some(MatchType::Full { char_len: 5, ids: &[2] }) |             Some(MatchType::Full { char_len: 5, ids: &[2] }) | ||||||
| @@ -422,11 +416,10 @@ mod tests { | |||||||
|             matching_words |             matching_words | ||||||
|                 .match_token(&Token { |                 .match_token(&Token { | ||||||
|                     kind: TokenKind::Word, |                     kind: TokenKind::Word, | ||||||
|                     word: Cow::Borrowed("wordsplit"), |                     lemma: Cow::Borrowed("wordsplit"), | ||||||
|                     byte_start: 0, |                     char_end: "wordsplit".chars().count(), | ||||||
|                     char_index: 0, |  | ||||||
|                     byte_end: "wordsplit".len(), |                     byte_end: "wordsplit".len(), | ||||||
|                     char_map: None, |                     ..Default::default() | ||||||
|                 }) |                 }) | ||||||
|                 .next(), |                 .next(), | ||||||
|             Some(MatchType::Full { char_len: 4, ids: &[2] }) |             Some(MatchType::Full { char_len: 4, ids: &[2] }) | ||||||
|   | |||||||
| @@ -1,8 +1,8 @@ | |||||||
| use std::borrow::Cow; | use std::borrow::Cow; | ||||||
|  |  | ||||||
|  | use charabia::{SeparatorKind, Token}; | ||||||
| use matching_words::{MatchType, PartialMatch, PrimitiveWordId}; | use matching_words::{MatchType, PartialMatch, PrimitiveWordId}; | ||||||
| pub use matching_words::{MatchingWord, MatchingWords}; | pub use matching_words::{MatchingWord, MatchingWords}; | ||||||
| use meilisearch_tokenizer::token::{SeparatorKind, Token}; |  | ||||||
| use serde::Serialize; | use serde::Serialize; | ||||||
|  |  | ||||||
| pub mod matching_words; | pub mod matching_words; | ||||||
| @@ -168,13 +168,13 @@ impl<'t> Matcher<'t, '_> { | |||||||
|                 let current_token_position = *token_position; |                 let current_token_position = *token_position; | ||||||
|                 let current_word_position = *word_position; |                 let current_word_position = *word_position; | ||||||
|                 *token_position += 1; |                 *token_position += 1; | ||||||
|                 if token.is_separator().is_none() { |                 if !token.is_separator() { | ||||||
|                     *word_position += 1; |                     *word_position += 1; | ||||||
|                 } |                 } | ||||||
|  |  | ||||||
|                 Some((current_token_position, current_word_position, token)) |                 Some((current_token_position, current_word_position, token)) | ||||||
|             }) |             }) | ||||||
|             .filter(|(_, _, token)| token.is_separator().is_none()); |             .filter(|(_, _, token)| !token.is_separator()); | ||||||
|  |  | ||||||
|         while let Some((token_position, word_position, word)) = words_positions.next() { |         while let Some((token_position, word_position, word)) = words_positions.next() { | ||||||
|             for match_type in self.matching_words.match_token(word) { |             for match_type in self.matching_words.match_token(word) { | ||||||
| @@ -243,8 +243,8 @@ impl<'t> Matcher<'t, '_> { | |||||||
|         let mut after_tokens = self.tokens[last_match_token_position..].iter().peekable(); |         let mut after_tokens = self.tokens[last_match_token_position..].iter().peekable(); | ||||||
|  |  | ||||||
|         while remaining_words > 0 { |         while remaining_words > 0 { | ||||||
|             let before_token = before_tokens.peek().map(|t| t.is_separator()); |             let before_token = before_tokens.peek().map(|t| t.separator_kind()); | ||||||
|             let after_token = after_tokens.peek().map(|t| t.is_separator()); |             let after_token = after_tokens.peek().map(|t| t.separator_kind()); | ||||||
|  |  | ||||||
|             match (before_token, after_token) { |             match (before_token, after_token) { | ||||||
|                 // we can expand both sides. |                 // we can expand both sides. | ||||||
| @@ -470,7 +470,7 @@ impl<'t> Matcher<'t, '_> { | |||||||
|  |  | ||||||
| #[cfg(test)] | #[cfg(test)] | ||||||
| mod tests { | mod tests { | ||||||
|     use meilisearch_tokenizer::{Analyzer, AnalyzerConfig}; |     use charabia::Tokenize; | ||||||
|  |  | ||||||
|     use super::*; |     use super::*; | ||||||
|     use crate::search::matches::matching_words::MatchingWord; |     use crate::search::matches::matching_words::MatchingWord; | ||||||
| @@ -490,30 +490,26 @@ mod tests { | |||||||
|         let matching_words = matching_words(); |         let matching_words = matching_words(); | ||||||
|  |  | ||||||
|         let builder = MatcherBuilder::from_matching_words(matching_words); |         let builder = MatcherBuilder::from_matching_words(matching_words); | ||||||
|         let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default()); |  | ||||||
|  |  | ||||||
|         let format_options = FormatOptions { highlight: false, crop: None }; |         let format_options = FormatOptions { highlight: false, crop: None }; | ||||||
|  |  | ||||||
|         // Text without any match. |         // Text without any match. | ||||||
|         let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!"; |         let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!"; | ||||||
|         let analyzed = analyzer.analyze(&text); |         let tokens: Vec<_> = text.tokenize().collect(); | ||||||
|         let tokens: Vec<_> = analyzed.tokens().collect(); |  | ||||||
|         let mut matcher = builder.build(&tokens[..], text); |         let mut matcher = builder.build(&tokens[..], text); | ||||||
|         // no crop and no highlight should return complete text. |         // no crop and no highlight should return complete text. | ||||||
|         assert_eq!(&matcher.format(format_options), &text); |         assert_eq!(&matcher.format(format_options), &text); | ||||||
|  |  | ||||||
|         // Text containing all matches. |         // Text containing all matches. | ||||||
|         let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World."; |         let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World."; | ||||||
|         let analyzed = analyzer.analyze(&text); |         let tokens: Vec<_> = text.tokenize().collect(); | ||||||
|         let tokens: Vec<_> = analyzed.tokens().collect(); |  | ||||||
|         let mut matcher = builder.build(&tokens[..], text); |         let mut matcher = builder.build(&tokens[..], text); | ||||||
|         // no crop and no highlight should return complete text. |         // no crop and no highlight should return complete text. | ||||||
|         assert_eq!(&matcher.format(format_options), &text); |         assert_eq!(&matcher.format(format_options), &text); | ||||||
|  |  | ||||||
|         // Text containing some matches. |         // Text containing some matches. | ||||||
|         let text = "Natalie risk her future to build a world with the boy she loves."; |         let text = "Natalie risk her future to build a world with the boy she loves."; | ||||||
|         let analyzed = analyzer.analyze(&text); |         let tokens: Vec<_> = text.tokenize().collect(); | ||||||
|         let tokens: Vec<_> = analyzed.tokens().collect(); |  | ||||||
|         let mut matcher = builder.build(&tokens[..], text); |         let mut matcher = builder.build(&tokens[..], text); | ||||||
|         // no crop and no highlight should return complete text. |         // no crop and no highlight should return complete text. | ||||||
|         assert_eq!(&matcher.format(format_options), &text); |         assert_eq!(&matcher.format(format_options), &text); | ||||||
| @@ -524,44 +520,38 @@ mod tests { | |||||||
|         let matching_words = matching_words(); |         let matching_words = matching_words(); | ||||||
|  |  | ||||||
|         let builder = MatcherBuilder::from_matching_words(matching_words); |         let builder = MatcherBuilder::from_matching_words(matching_words); | ||||||
|         let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default()); |  | ||||||
|  |  | ||||||
|         let format_options = FormatOptions { highlight: true, crop: None }; |         let format_options = FormatOptions { highlight: true, crop: None }; | ||||||
|  |  | ||||||
|         // empty text. |         // empty text. | ||||||
|         let text = ""; |         let text = ""; | ||||||
|         let analyzed = analyzer.analyze(&text); |         let tokens: Vec<_> = text.tokenize().collect(); | ||||||
|         let tokens: Vec<_> = analyzed.tokens().collect(); |  | ||||||
|         let mut matcher = builder.build(&tokens[..], text); |         let mut matcher = builder.build(&tokens[..], text); | ||||||
|         assert_eq!(&matcher.format(format_options), ""); |         assert_eq!(&matcher.format(format_options), ""); | ||||||
|  |  | ||||||
|         // text containing only separators. |         // text containing only separators. | ||||||
|         let text = ":-)"; |         let text = ":-)"; | ||||||
|         let analyzed = analyzer.analyze(&text); |         let tokens: Vec<_> = text.tokenize().collect(); | ||||||
|         let tokens: Vec<_> = analyzed.tokens().collect(); |  | ||||||
|         let mut matcher = builder.build(&tokens[..], text); |         let mut matcher = builder.build(&tokens[..], text); | ||||||
|         assert_eq!(&matcher.format(format_options), ":-)"); |         assert_eq!(&matcher.format(format_options), ":-)"); | ||||||
|  |  | ||||||
|         // Text without any match. |         // Text without any match. | ||||||
|         let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!"; |         let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!"; | ||||||
|         let analyzed = analyzer.analyze(&text); |         let tokens: Vec<_> = text.tokenize().collect(); | ||||||
|         let tokens: Vec<_> = analyzed.tokens().collect(); |  | ||||||
|         let mut matcher = builder.build(&tokens[..], text); |         let mut matcher = builder.build(&tokens[..], text); | ||||||
|         // no crop should return complete text, because there are no matches. |         // no crop should return complete text, because there are no matches. | ||||||
|         assert_eq!(&matcher.format(format_options), &text); |         assert_eq!(&matcher.format(format_options), &text); | ||||||
|  |  | ||||||
|         // Text containing all matches. |         // Text containing all matches. | ||||||
|         let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World."; |         let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World."; | ||||||
|         let analyzed = analyzer.analyze(&text); |         let tokens: Vec<_> = text.tokenize().collect(); | ||||||
|         let tokens: Vec<_> = analyzed.tokens().collect(); |  | ||||||
|         let mut matcher = builder.build(&tokens[..], text); |         let mut matcher = builder.build(&tokens[..], text); | ||||||
|         // no crop should return complete text with highlighted matches. |         // no crop should return complete text with highlighted matches. | ||||||
|         assert_eq!(&matcher.format(format_options), "Natalie risk her future to build a <em>world</em> with <em>the</em> boy she loves. Emily Henry: <em>The</em> Love That <em>Split</em> <em>The</em> <em>World</em>."); |         assert_eq!(&matcher.format(format_options), "Natalie risk her future to build a <em>world</em> with <em>the</em> boy she loves. Emily Henry: <em>The</em> Love That <em>Split</em> <em>The</em> <em>World</em>."); | ||||||
|  |  | ||||||
|         // Text containing some matches. |         // Text containing some matches. | ||||||
|         let text = "Natalie risk her future to build a world with the boy she loves."; |         let text = "Natalie risk her future to build a world with the boy she loves."; | ||||||
|         let analyzed = analyzer.analyze(&text); |         let tokens: Vec<_> = text.tokenize().collect(); | ||||||
|         let tokens: Vec<_> = analyzed.tokens().collect(); |  | ||||||
|         let mut matcher = builder.build(&tokens[..], text); |         let mut matcher = builder.build(&tokens[..], text); | ||||||
|         // no crop should return complete text with highlighted matches. |         // no crop should return complete text with highlighted matches. | ||||||
|         assert_eq!( |         assert_eq!( | ||||||
| @@ -580,30 +570,26 @@ mod tests { | |||||||
|         let matching_words = MatchingWords::new(matching_words); |         let matching_words = MatchingWords::new(matching_words); | ||||||
|  |  | ||||||
|         let builder = MatcherBuilder::from_matching_words(matching_words); |         let builder = MatcherBuilder::from_matching_words(matching_words); | ||||||
|         let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default()); |  | ||||||
|  |  | ||||||
|         let format_options = FormatOptions { highlight: true, crop: None }; |         let format_options = FormatOptions { highlight: true, crop: None }; | ||||||
|  |  | ||||||
|         // Text containing prefix match. |         // Text containing prefix match. | ||||||
|         let text = "Ŵôřlḑôle"; |         let text = "Ŵôřlḑôle"; | ||||||
|         let analyzed = analyzer.analyze(&text); |         let tokens: Vec<_> = text.tokenize().collect(); | ||||||
|         let tokens: Vec<_> = analyzed.tokens().collect(); |  | ||||||
|         let mut matcher = builder.build(&tokens[..], text); |         let mut matcher = builder.build(&tokens[..], text); | ||||||
|         // no crop should return complete text with highlighted matches. |         // no crop should return complete text with highlighted matches. | ||||||
|         assert_eq!(&matcher.format(format_options), "<em>Ŵôřlḑ</em>ôle"); |         assert_eq!(&matcher.format(format_options), "<em>Ŵôřlḑ</em>ôle"); | ||||||
|  |  | ||||||
|         // Text containing unicode match. |         // Text containing unicode match. | ||||||
|         let text = "Ŵôřlḑ"; |         let text = "Ŵôřlḑ"; | ||||||
|         let analyzed = analyzer.analyze(&text); |         let tokens: Vec<_> = text.tokenize().collect(); | ||||||
|         let tokens: Vec<_> = analyzed.tokens().collect(); |  | ||||||
|         let mut matcher = builder.build(&tokens[..], text); |         let mut matcher = builder.build(&tokens[..], text); | ||||||
|         // no crop should return complete text with highlighted matches. |         // no crop should return complete text with highlighted matches. | ||||||
|         assert_eq!(&matcher.format(format_options), "<em>Ŵôřlḑ</em>"); |         assert_eq!(&matcher.format(format_options), "<em>Ŵôřlḑ</em>"); | ||||||
|  |  | ||||||
|         // Text containing unicode match. |         // Text containing unicode match. | ||||||
|         let text = "Westfália"; |         let text = "Westfália"; | ||||||
|         let analyzed = analyzer.analyze(&text); |         let tokens: Vec<_> = text.tokenize().collect(); | ||||||
|         let tokens: Vec<_> = analyzed.tokens().collect(); |  | ||||||
|         let mut matcher = builder.build(&tokens[..], text); |         let mut matcher = builder.build(&tokens[..], text); | ||||||
|         // no crop should return complete text with highlighted matches. |         // no crop should return complete text with highlighted matches. | ||||||
|         assert_eq!(&matcher.format(format_options), "<em>Westfáli</em>a"); |         assert_eq!(&matcher.format(format_options), "<em>Westfáli</em>a"); | ||||||
| @@ -614,28 +600,24 @@ mod tests { | |||||||
|         let matching_words = matching_words(); |         let matching_words = matching_words(); | ||||||
|  |  | ||||||
|         let builder = MatcherBuilder::from_matching_words(matching_words); |         let builder = MatcherBuilder::from_matching_words(matching_words); | ||||||
|         let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default()); |  | ||||||
|  |  | ||||||
|         let format_options = FormatOptions { highlight: false, crop: Some(10) }; |         let format_options = FormatOptions { highlight: false, crop: Some(10) }; | ||||||
|  |  | ||||||
|         // empty text. |         // empty text. | ||||||
|         let text = ""; |         let text = ""; | ||||||
|         let analyzed = analyzer.analyze(&text); |         let tokens: Vec<_> = text.tokenize().collect(); | ||||||
|         let tokens: Vec<_> = analyzed.tokens().collect(); |  | ||||||
|         let mut matcher = builder.build(&tokens[..], text); |         let mut matcher = builder.build(&tokens[..], text); | ||||||
|         assert_eq!(&matcher.format(format_options), ""); |         assert_eq!(&matcher.format(format_options), ""); | ||||||
|  |  | ||||||
|         // text containing only separators. |         // text containing only separators. | ||||||
|         let text = ":-)"; |         let text = ":-)"; | ||||||
|         let analyzed = analyzer.analyze(&text); |         let tokens: Vec<_> = text.tokenize().collect(); | ||||||
|         let tokens: Vec<_> = analyzed.tokens().collect(); |  | ||||||
|         let mut matcher = builder.build(&tokens[..], text); |         let mut matcher = builder.build(&tokens[..], text); | ||||||
|         assert_eq!(&matcher.format(format_options), ":-)"); |         assert_eq!(&matcher.format(format_options), ":-)"); | ||||||
|  |  | ||||||
|         // Text without any match. |         // Text without any match. | ||||||
|         let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!"; |         let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!"; | ||||||
|         let analyzed = analyzer.analyze(&text); |         let tokens: Vec<_> = text.tokenize().collect(); | ||||||
|         let tokens: Vec<_> = analyzed.tokens().collect(); |  | ||||||
|         let mut matcher = builder.build(&tokens[..], text); |         let mut matcher = builder.build(&tokens[..], text); | ||||||
|         // no highlight should return 10 first words with a marker at the end. |         // no highlight should return 10 first words with a marker at the end. | ||||||
|         assert_eq!( |         assert_eq!( | ||||||
| @@ -645,8 +627,7 @@ mod tests { | |||||||
|  |  | ||||||
|         // Text without any match starting by a separator. |         // Text without any match starting by a separator. | ||||||
|         let text = "(A quick brown fox can not jump 32 feet, right? Brr, it is cold!)"; |         let text = "(A quick brown fox can not jump 32 feet, right? Brr, it is cold!)"; | ||||||
|         let analyzed = analyzer.analyze(&text); |         let tokens: Vec<_> = text.tokenize().collect(); | ||||||
|         let tokens: Vec<_> = analyzed.tokens().collect(); |  | ||||||
|         let mut matcher = builder.build(&tokens[..], text); |         let mut matcher = builder.build(&tokens[..], text); | ||||||
|         // no highlight should return 10 first words with a marker at the end. |         // no highlight should return 10 first words with a marker at the end. | ||||||
|         assert_eq!( |         assert_eq!( | ||||||
| @@ -656,19 +637,17 @@ mod tests { | |||||||
|  |  | ||||||
|         // Test phrase propagation |         // Test phrase propagation | ||||||
|         let text = "Natalie risk her future. Split The World is a book written by Emily Henry. I never read it."; |         let text = "Natalie risk her future. Split The World is a book written by Emily Henry. I never read it."; | ||||||
|         let analyzed = analyzer.analyze(&text); |         let tokens: Vec<_> = text.tokenize().collect(); | ||||||
|         let tokens: Vec<_> = analyzed.tokens().collect(); |  | ||||||
|         let mut matcher = builder.build(&tokens[..], text); |         let mut matcher = builder.build(&tokens[..], text); | ||||||
|         // should crop the phrase instead of cropping around the match. |         // should crop the phrase instead of cropping around the match. | ||||||
|         assert_eq!( |         assert_eq!( | ||||||
|             &matcher.format(format_options), |             &matcher.format(format_options), | ||||||
|             "…Split The World is a book written by Emily Henry…" |             "… Split The World is a book written by Emily Henry…", | ||||||
|         ); |         ); | ||||||
|  |  | ||||||
|         // Text containing some matches. |         // Text containing some matches. | ||||||
|         let text = "Natalie risk her future to build a world with the boy she loves."; |         let text = "Natalie risk her future to build a world with the boy she loves."; | ||||||
|         let analyzed = analyzer.analyze(&text); |         let tokens: Vec<_> = text.tokenize().collect(); | ||||||
|         let tokens: Vec<_> = analyzed.tokens().collect(); |  | ||||||
|         let mut matcher = builder.build(&tokens[..], text); |         let mut matcher = builder.build(&tokens[..], text); | ||||||
|         // no highlight should return 10 last words with a marker at the start. |         // no highlight should return 10 last words with a marker at the start. | ||||||
|         assert_eq!( |         assert_eq!( | ||||||
| @@ -678,8 +657,7 @@ mod tests { | |||||||
|  |  | ||||||
|         // Text containing all matches. |         // Text containing all matches. | ||||||
|         let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World."; |         let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World."; | ||||||
|         let analyzed = analyzer.analyze(&text); |         let tokens: Vec<_> = text.tokenize().collect(); | ||||||
|         let tokens: Vec<_> = analyzed.tokens().collect(); |  | ||||||
|         let mut matcher = builder.build(&tokens[..], text); |         let mut matcher = builder.build(&tokens[..], text); | ||||||
|         // no highlight should return 10 last words with a marker at the start. |         // no highlight should return 10 last words with a marker at the start. | ||||||
|         assert_eq!( |         assert_eq!( | ||||||
| @@ -689,8 +667,7 @@ mod tests { | |||||||
|  |  | ||||||
|         // Text containing a match unordered and a match ordered. |         // Text containing a match unordered and a match ordered. | ||||||
|         let text = "The world split void void void void void void void void void split the world void void"; |         let text = "The world split void void void void void void void void void split the world void void"; | ||||||
|         let analyzed = analyzer.analyze(&text); |         let tokens: Vec<_> = text.tokenize().collect(); | ||||||
|         let tokens: Vec<_> = analyzed.tokens().collect(); |  | ||||||
|         let mut matcher = builder.build(&tokens[..], text); |         let mut matcher = builder.build(&tokens[..], text); | ||||||
|         // crop should return 10 last words with a marker at the start. |         // crop should return 10 last words with a marker at the start. | ||||||
|         assert_eq!( |         assert_eq!( | ||||||
| @@ -700,8 +677,7 @@ mod tests { | |||||||
|  |  | ||||||
|         // Text containing matches with different density. |         // Text containing matches with different density. | ||||||
|         let text = "split void the void void world void void void void void void void void void void split the world void void"; |         let text = "split void the void void world void void void void void void void void void void split the world void void"; | ||||||
|         let analyzed = analyzer.analyze(&text); |         let tokens: Vec<_> = text.tokenize().collect(); | ||||||
|         let tokens: Vec<_> = analyzed.tokens().collect(); |  | ||||||
|         let mut matcher = builder.build(&tokens[..], text); |         let mut matcher = builder.build(&tokens[..], text); | ||||||
|         // crop should return 10 last words with a marker at the start. |         // crop should return 10 last words with a marker at the start. | ||||||
|         assert_eq!( |         assert_eq!( | ||||||
| @@ -711,8 +687,7 @@ mod tests { | |||||||
|  |  | ||||||
|         // Text containing matches with same word. |         // Text containing matches with same word. | ||||||
|         let text = "split split split split split split void void void void void void void void void void split the world void void"; |         let text = "split split split split split split void void void void void void void void void void split the world void void"; | ||||||
|         let analyzed = analyzer.analyze(&text); |         let tokens: Vec<_> = text.tokenize().collect(); | ||||||
|         let tokens: Vec<_> = analyzed.tokens().collect(); |  | ||||||
|         let mut matcher = builder.build(&tokens[..], text); |         let mut matcher = builder.build(&tokens[..], text); | ||||||
|         // crop should return 10 last words with a marker at the start. |         // crop should return 10 last words with a marker at the start. | ||||||
|         assert_eq!( |         assert_eq!( | ||||||
| @@ -726,28 +701,24 @@ mod tests { | |||||||
|         let matching_words = matching_words(); |         let matching_words = matching_words(); | ||||||
|  |  | ||||||
|         let builder = MatcherBuilder::from_matching_words(matching_words); |         let builder = MatcherBuilder::from_matching_words(matching_words); | ||||||
|         let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default()); |  | ||||||
|  |  | ||||||
|         let format_options = FormatOptions { highlight: true, crop: Some(10) }; |         let format_options = FormatOptions { highlight: true, crop: Some(10) }; | ||||||
|  |  | ||||||
|         // empty text. |         // empty text. | ||||||
|         let text = ""; |         let text = ""; | ||||||
|         let analyzed = analyzer.analyze(&text); |         let tokens: Vec<_> = text.tokenize().collect(); | ||||||
|         let tokens: Vec<_> = analyzed.tokens().collect(); |  | ||||||
|         let mut matcher = builder.build(&tokens[..], text); |         let mut matcher = builder.build(&tokens[..], text); | ||||||
|         assert_eq!(&matcher.format(format_options), ""); |         assert_eq!(&matcher.format(format_options), ""); | ||||||
|  |  | ||||||
|         // text containing only separators. |         // text containing only separators. | ||||||
|         let text = ":-)"; |         let text = ":-)"; | ||||||
|         let analyzed = analyzer.analyze(&text); |         let tokens: Vec<_> = text.tokenize().collect(); | ||||||
|         let tokens: Vec<_> = analyzed.tokens().collect(); |  | ||||||
|         let mut matcher = builder.build(&tokens[..], text); |         let mut matcher = builder.build(&tokens[..], text); | ||||||
|         assert_eq!(&matcher.format(format_options), ":-)"); |         assert_eq!(&matcher.format(format_options), ":-)"); | ||||||
|  |  | ||||||
|         // Text without any match. |         // Text without any match. | ||||||
|         let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!"; |         let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!"; | ||||||
|         let analyzed = analyzer.analyze(&text); |         let tokens: Vec<_> = text.tokenize().collect(); | ||||||
|         let tokens: Vec<_> = analyzed.tokens().collect(); |  | ||||||
|         let mut matcher = builder.build(&tokens[..], text); |         let mut matcher = builder.build(&tokens[..], text); | ||||||
|         // both should return 10 first words with a marker at the end. |         // both should return 10 first words with a marker at the end. | ||||||
|         assert_eq!( |         assert_eq!( | ||||||
| @@ -757,8 +728,7 @@ mod tests { | |||||||
|  |  | ||||||
|         // Text containing some matches. |         // Text containing some matches. | ||||||
|         let text = "Natalie risk her future to build a world with the boy she loves."; |         let text = "Natalie risk her future to build a world with the boy she loves."; | ||||||
|         let analyzed = analyzer.analyze(&text); |         let tokens: Vec<_> = text.tokenize().collect(); | ||||||
|         let tokens: Vec<_> = analyzed.tokens().collect(); |  | ||||||
|         let mut matcher = builder.build(&tokens[..], text); |         let mut matcher = builder.build(&tokens[..], text); | ||||||
|         // both should return 10 last words with a marker at the start and highlighted matches. |         // both should return 10 last words with a marker at the start and highlighted matches. | ||||||
|         assert_eq!( |         assert_eq!( | ||||||
| @@ -768,16 +738,14 @@ mod tests { | |||||||
|  |  | ||||||
|         // Text containing all matches. |         // Text containing all matches. | ||||||
|         let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World."; |         let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World."; | ||||||
|         let analyzed = analyzer.analyze(&text); |         let tokens: Vec<_> = text.tokenize().collect(); | ||||||
|         let tokens: Vec<_> = analyzed.tokens().collect(); |  | ||||||
|         let mut matcher = builder.build(&tokens[..], text); |         let mut matcher = builder.build(&tokens[..], text); | ||||||
|         // both should return 10 last words with a marker at the start and highlighted matches. |         // both should return 10 last words with a marker at the start and highlighted matches. | ||||||
|         assert_eq!(&matcher.format(format_options), "…she loves. Emily Henry: <em>The</em> Love That <em>Split</em> <em>The</em> <em>World</em>."); |         assert_eq!(&matcher.format(format_options), "…she loves. Emily Henry: <em>The</em> Love That <em>Split</em> <em>The</em> <em>World</em>."); | ||||||
|  |  | ||||||
|         // Text containing a match unordered and a match ordered. |         // Text containing a match unordered and a match ordered. | ||||||
|         let text = "The world split void void void void void void void void void split the world void void"; |         let text = "The world split void void void void void void void void void split the world void void"; | ||||||
|         let analyzed = analyzer.analyze(&text); |         let tokens: Vec<_> = text.tokenize().collect(); | ||||||
|         let tokens: Vec<_> = analyzed.tokens().collect(); |  | ||||||
|         let mut matcher = builder.build(&tokens[..], text); |         let mut matcher = builder.build(&tokens[..], text); | ||||||
|         // crop should return 10 last words with a marker at the start. |         // crop should return 10 last words with a marker at the start. | ||||||
|         assert_eq!( |         assert_eq!( | ||||||
| @@ -792,11 +760,9 @@ mod tests { | |||||||
|         let matching_words = matching_words(); |         let matching_words = matching_words(); | ||||||
|  |  | ||||||
|         let builder = MatcherBuilder::from_matching_words(matching_words); |         let builder = MatcherBuilder::from_matching_words(matching_words); | ||||||
|         let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default()); |  | ||||||
|  |  | ||||||
|         let text = "void void split the world void void."; |         let text = "void void split the world void void."; | ||||||
|         let analyzed = analyzer.analyze(&text); |         let tokens: Vec<_> = text.tokenize().collect(); | ||||||
|         let tokens: Vec<_> = analyzed.tokens().collect(); |  | ||||||
|  |  | ||||||
|         // set a smaller crop size |         // set a smaller crop size | ||||||
|         let format_options = FormatOptions { highlight: false, crop: Some(2) }; |         let format_options = FormatOptions { highlight: false, crop: Some(2) }; | ||||||
| @@ -847,13 +813,11 @@ mod tests { | |||||||
|         let mut builder = MatcherBuilder::from_matching_words(matching_words); |         let mut builder = MatcherBuilder::from_matching_words(matching_words); | ||||||
|         builder.highlight_prefix("_".to_string()); |         builder.highlight_prefix("_".to_string()); | ||||||
|         builder.highlight_suffix("_".to_string()); |         builder.highlight_suffix("_".to_string()); | ||||||
|         let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default()); |  | ||||||
|  |  | ||||||
|         let format_options = FormatOptions { highlight: true, crop: None }; |         let format_options = FormatOptions { highlight: true, crop: None }; | ||||||
|  |  | ||||||
|         let text = "the do or die can't be he do and or isn't he"; |         let text = "the do or die can't be he do and or isn't he"; | ||||||
|         let analyzed = analyzer.analyze(&text); |         let tokens: Vec<_> = text.tokenize().collect(); | ||||||
|         let tokens: Vec<_> = analyzed.tokens().collect(); |  | ||||||
|  |  | ||||||
|         let mut matcher = builder.build(&tokens[..], text); |         let mut matcher = builder.build(&tokens[..], text); | ||||||
|         assert_eq!( |         assert_eq!( | ||||||
|   | |||||||
| @@ -6,12 +6,12 @@ use std::result::Result as StdResult; | |||||||
| use std::str::Utf8Error; | use std::str::Utf8Error; | ||||||
| use std::time::Instant; | use std::time::Instant; | ||||||
|  |  | ||||||
|  | use charabia::TokenizerBuilder; | ||||||
| use distinct::{Distinct, DocIter, FacetDistinct, NoopDistinct}; | use distinct::{Distinct, DocIter, FacetDistinct, NoopDistinct}; | ||||||
| use fst::automaton::Str; | use fst::automaton::Str; | ||||||
| use fst::{Automaton, IntoStreamer, Streamer}; | use fst::{Automaton, IntoStreamer, Streamer}; | ||||||
| use levenshtein_automata::{LevenshteinAutomatonBuilder as LevBuilder, DFA}; | use levenshtein_automata::{LevenshteinAutomatonBuilder as LevBuilder, DFA}; | ||||||
| use log::debug; | use log::debug; | ||||||
| use meilisearch_tokenizer::{Analyzer, AnalyzerConfig}; |  | ||||||
| use once_cell::sync::Lazy; | use once_cell::sync::Lazy; | ||||||
| use roaring::bitmap::RoaringBitmap; | use roaring::bitmap::RoaringBitmap; | ||||||
|  |  | ||||||
| @@ -126,14 +126,14 @@ impl<'a> Search<'a> { | |||||||
|                 builder.words_limit(self.words_limit); |                 builder.words_limit(self.words_limit); | ||||||
|                 // We make sure that the analyzer is aware of the stop words |                 // We make sure that the analyzer is aware of the stop words | ||||||
|                 // this ensures that the query builder is able to properly remove them. |                 // this ensures that the query builder is able to properly remove them. | ||||||
|                 let mut config = AnalyzerConfig::default(); |                 let mut tokbuilder = TokenizerBuilder::new(); | ||||||
|                 let stop_words = self.index.stop_words(self.rtxn)?; |                 let stop_words = self.index.stop_words(self.rtxn)?; | ||||||
|                 if let Some(ref stop_words) = stop_words { |                 if let Some(ref stop_words) = stop_words { | ||||||
|                     config.stop_words(stop_words); |                     tokbuilder.stop_words(stop_words); | ||||||
|                 } |                 } | ||||||
|                 let analyzer = Analyzer::new(config); |  | ||||||
|                 let result = analyzer.analyze(query); |                 let tokenizer = tokbuilder.build(); | ||||||
|                 let tokens = result.tokens(); |                 let tokens = tokenizer.tokenize(query); | ||||||
|                 builder |                 builder | ||||||
|                     .build(tokens)? |                     .build(tokens)? | ||||||
|                     .map_or((None, None, None), |(qt, pq, mw)| (Some(qt), Some(pq), Some(mw))) |                     .map_or((None, None, None), |(qt, pq, mw)| (Some(qt), Some(pq), Some(mw))) | ||||||
|   | |||||||
| @@ -1,10 +1,9 @@ | |||||||
| use std::borrow::Cow; | use std::borrow::Cow; | ||||||
| use std::{cmp, fmt, mem}; | use std::{cmp, fmt, mem}; | ||||||
|  |  | ||||||
|  | use charabia::classifier::ClassifiedTokenIter; | ||||||
|  | use charabia::{SeparatorKind, TokenKind}; | ||||||
| use fst::Set; | use fst::Set; | ||||||
| use meilisearch_tokenizer::token::SeparatorKind; |  | ||||||
| use meilisearch_tokenizer::tokenizer::TokenStream; |  | ||||||
| use meilisearch_tokenizer::TokenKind; |  | ||||||
| use roaring::RoaringBitmap; | use roaring::RoaringBitmap; | ||||||
| use slice_group_by::GroupBy; | use slice_group_by::GroupBy; | ||||||
|  |  | ||||||
| @@ -235,9 +234,9 @@ impl<'a> QueryTreeBuilder<'a> { | |||||||
|     /// - if `authorize_typos` is set to `false` the query tree will be generated |     /// - if `authorize_typos` is set to `false` the query tree will be generated | ||||||
|     ///   forcing all query words to match documents without any typo |     ///   forcing all query words to match documents without any typo | ||||||
|     ///   (the criterion `typo` will be ignored) |     ///   (the criterion `typo` will be ignored) | ||||||
|     pub fn build( |     pub fn build<A: AsRef<[u8]>>( | ||||||
|         &self, |         &self, | ||||||
|         query: TokenStream, |         query: ClassifiedTokenIter<A>, | ||||||
|     ) -> Result<Option<(Operation, PrimitiveQuery, MatchingWords)>> { |     ) -> Result<Option<(Operation, PrimitiveQuery, MatchingWords)>> { | ||||||
|         let stop_words = self.index.stop_words(self.rtxn)?; |         let stop_words = self.index.stop_words(self.rtxn)?; | ||||||
|         let primitive_query = create_primitive_query(query, stop_words, self.words_limit); |         let primitive_query = create_primitive_query(query, stop_words, self.words_limit); | ||||||
| @@ -649,11 +648,14 @@ impl PrimitiveQueryPart { | |||||||
|  |  | ||||||
| /// Create primitive query from tokenized query string, | /// Create primitive query from tokenized query string, | ||||||
| /// the primitive query is an intermediate state to build the query tree. | /// the primitive query is an intermediate state to build the query tree. | ||||||
| fn create_primitive_query( | fn create_primitive_query<A>( | ||||||
|     query: TokenStream, |     query: ClassifiedTokenIter<A>, | ||||||
|     stop_words: Option<Set<&[u8]>>, |     stop_words: Option<Set<&[u8]>>, | ||||||
|     words_limit: Option<usize>, |     words_limit: Option<usize>, | ||||||
| ) -> PrimitiveQuery { | ) -> PrimitiveQuery | ||||||
|  | where | ||||||
|  |     A: AsRef<[u8]>, | ||||||
|  | { | ||||||
|     let mut primitive_query = Vec::new(); |     let mut primitive_query = Vec::new(); | ||||||
|     let mut phrase = Vec::new(); |     let mut phrase = Vec::new(); | ||||||
|     let mut quoted = false; |     let mut quoted = false; | ||||||
| @@ -673,21 +675,18 @@ fn create_primitive_query( | |||||||
|                 // 2. if the word is not the last token of the query and is not a stop_word we push it as a non-prefix word, |                 // 2. if the word is not the last token of the query and is not a stop_word we push it as a non-prefix word, | ||||||
|                 // 3. if the word is the last token of the query we push it as a prefix word. |                 // 3. if the word is the last token of the query we push it as a prefix word. | ||||||
|                 if quoted { |                 if quoted { | ||||||
|                     phrase.push(token.word.to_string()); |                     phrase.push(token.lemma().to_string()); | ||||||
|                 } else if peekable.peek().is_some() { |                 } else if peekable.peek().is_some() { | ||||||
|                     if !stop_words |                     if !stop_words.as_ref().map_or(false, |swords| swords.contains(token.lemma())) { | ||||||
|                         .as_ref() |  | ||||||
|                         .map_or(false, |swords| swords.contains(token.word.as_ref())) |  | ||||||
|                     { |  | ||||||
|                         primitive_query |                         primitive_query | ||||||
|                             .push(PrimitiveQueryPart::Word(token.word.to_string(), false)); |                             .push(PrimitiveQueryPart::Word(token.lemma().to_string(), false)); | ||||||
|                     } |                     } | ||||||
|                 } else { |                 } else { | ||||||
|                     primitive_query.push(PrimitiveQueryPart::Word(token.word.to_string(), true)); |                     primitive_query.push(PrimitiveQueryPart::Word(token.lemma().to_string(), true)); | ||||||
|                 } |                 } | ||||||
|             } |             } | ||||||
|             TokenKind::Separator(separator_kind) => { |             TokenKind::Separator(separator_kind) => { | ||||||
|                 let quote_count = token.word.chars().filter(|&s| s == '"').count(); |                 let quote_count = token.lemma().chars().filter(|&s| s == '"').count(); | ||||||
|                 // swap quoted state if we encounter a double quote |                 // swap quoted state if we encounter a double quote | ||||||
|                 if quote_count % 2 != 0 { |                 if quote_count % 2 != 0 { | ||||||
|                     quoted = !quoted; |                     quoted = !quoted; | ||||||
| @@ -738,8 +737,8 @@ pub fn maximum_proximity(operation: &Operation) -> usize { | |||||||
| mod test { | mod test { | ||||||
|     use std::collections::HashMap; |     use std::collections::HashMap; | ||||||
|  |  | ||||||
|  |     use charabia::Tokenize; | ||||||
|     use maplit::hashmap; |     use maplit::hashmap; | ||||||
|     use meilisearch_tokenizer::{Analyzer, AnalyzerConfig}; |  | ||||||
|     use rand::rngs::StdRng; |     use rand::rngs::StdRng; | ||||||
|     use rand::{Rng, SeedableRng}; |     use rand::{Rng, SeedableRng}; | ||||||
|  |  | ||||||
| @@ -754,12 +753,12 @@ mod test { | |||||||
|     } |     } | ||||||
|  |  | ||||||
|     impl TestContext { |     impl TestContext { | ||||||
|         fn build( |         fn build<A: AsRef<[u8]>>( | ||||||
|             &self, |             &self, | ||||||
|             optional_words: bool, |             optional_words: bool, | ||||||
|             authorize_typos: bool, |             authorize_typos: bool, | ||||||
|             words_limit: Option<usize>, |             words_limit: Option<usize>, | ||||||
|             query: TokenStream, |             query: ClassifiedTokenIter<A>, | ||||||
|         ) -> Result<Option<(Operation, PrimitiveQuery)>> { |         ) -> Result<Option<(Operation, PrimitiveQuery)>> { | ||||||
|             let primitive_query = create_primitive_query(query, None, words_limit); |             let primitive_query = create_primitive_query(query, None, words_limit); | ||||||
|             if !primitive_query.is_empty() { |             if !primitive_query.is_empty() { | ||||||
| @@ -856,9 +855,7 @@ mod test { | |||||||
|     #[test] |     #[test] | ||||||
|     fn prefix() { |     fn prefix() { | ||||||
|         let query = "hey friends"; |         let query = "hey friends"; | ||||||
|         let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default()); |         let tokens = query.tokenize(); | ||||||
|         let result = analyzer.analyze(query); |  | ||||||
|         let tokens = result.tokens(); |  | ||||||
|  |  | ||||||
|         let expected = Operation::Or( |         let expected = Operation::Or( | ||||||
|             false, |             false, | ||||||
| @@ -889,9 +886,7 @@ mod test { | |||||||
|     #[test] |     #[test] | ||||||
|     fn no_prefix() { |     fn no_prefix() { | ||||||
|         let query = "hey friends "; |         let query = "hey friends "; | ||||||
|         let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default()); |         let tokens = query.tokenize(); | ||||||
|         let result = analyzer.analyze(query); |  | ||||||
|         let tokens = result.tokens(); |  | ||||||
|  |  | ||||||
|         let expected = Operation::Or( |         let expected = Operation::Or( | ||||||
|             false, |             false, | ||||||
| @@ -922,9 +917,7 @@ mod test { | |||||||
|     #[test] |     #[test] | ||||||
|     fn synonyms() { |     fn synonyms() { | ||||||
|         let query = "hello world "; |         let query = "hello world "; | ||||||
|         let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default()); |         let tokens = query.tokenize(); | ||||||
|         let result = analyzer.analyze(query); |  | ||||||
|         let tokens = result.tokens(); |  | ||||||
|  |  | ||||||
|         let expected = Operation::Or( |         let expected = Operation::Or( | ||||||
|             false, |             false, | ||||||
| @@ -987,9 +980,7 @@ mod test { | |||||||
|     #[test] |     #[test] | ||||||
|     fn complex_synonyms() { |     fn complex_synonyms() { | ||||||
|         let query = "new york city "; |         let query = "new york city "; | ||||||
|         let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default()); |         let tokens = query.tokenize(); | ||||||
|         let result = analyzer.analyze(query); |  | ||||||
|         let tokens = result.tokens(); |  | ||||||
|  |  | ||||||
|         let expected = Operation::Or( |         let expected = Operation::Or( | ||||||
|             false, |             false, | ||||||
| @@ -1087,9 +1078,7 @@ mod test { | |||||||
|     #[test] |     #[test] | ||||||
|     fn ngrams() { |     fn ngrams() { | ||||||
|         let query = "n grams "; |         let query = "n grams "; | ||||||
|         let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default()); |         let tokens = query.tokenize(); | ||||||
|         let result = analyzer.analyze(query); |  | ||||||
|         let tokens = result.tokens(); |  | ||||||
|  |  | ||||||
|         let expected = Operation::Or( |         let expected = Operation::Or( | ||||||
|             false, |             false, | ||||||
| @@ -1120,9 +1109,7 @@ mod test { | |||||||
|     #[test] |     #[test] | ||||||
|     fn word_split() { |     fn word_split() { | ||||||
|         let query = "wordsplit fish "; |         let query = "wordsplit fish "; | ||||||
|         let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default()); |         let tokens = query.tokenize(); | ||||||
|         let result = analyzer.analyze(query); |  | ||||||
|         let tokens = result.tokens(); |  | ||||||
|  |  | ||||||
|         let expected = Operation::Or( |         let expected = Operation::Or( | ||||||
|             false, |             false, | ||||||
| @@ -1159,9 +1146,7 @@ mod test { | |||||||
|     #[test] |     #[test] | ||||||
|     fn phrase() { |     fn phrase() { | ||||||
|         let query = "\"hey friends\" \" \" \"wooop"; |         let query = "\"hey friends\" \" \" \"wooop"; | ||||||
|         let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default()); |         let tokens = query.tokenize(); | ||||||
|         let result = analyzer.analyze(query); |  | ||||||
|         let tokens = result.tokens(); |  | ||||||
|  |  | ||||||
|         let expected = Operation::And(vec![ |         let expected = Operation::And(vec![ | ||||||
|             Operation::Phrase(vec!["hey".to_string(), "friends".to_string()]), |             Operation::Phrase(vec!["hey".to_string(), "friends".to_string()]), | ||||||
| @@ -1177,9 +1162,7 @@ mod test { | |||||||
|     #[test] |     #[test] | ||||||
|     fn phrase_with_hard_separator() { |     fn phrase_with_hard_separator() { | ||||||
|         let query = "\"hey friends. wooop wooop\""; |         let query = "\"hey friends. wooop wooop\""; | ||||||
|         let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default()); |         let tokens = query.tokenize(); | ||||||
|         let result = analyzer.analyze(query); |  | ||||||
|         let tokens = result.tokens(); |  | ||||||
|  |  | ||||||
|         let expected = Operation::And(vec![ |         let expected = Operation::And(vec![ | ||||||
|             Operation::Phrase(vec!["hey".to_string(), "friends".to_string()]), |             Operation::Phrase(vec!["hey".to_string(), "friends".to_string()]), | ||||||
| @@ -1195,9 +1178,7 @@ mod test { | |||||||
|     #[test] |     #[test] | ||||||
|     fn optional_word() { |     fn optional_word() { | ||||||
|         let query = "hey my friend "; |         let query = "hey my friend "; | ||||||
|         let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default()); |         let tokens = query.tokenize(); | ||||||
|         let result = analyzer.analyze(query); |  | ||||||
|         let tokens = result.tokens(); |  | ||||||
|  |  | ||||||
|         let expected = Operation::Or( |         let expected = Operation::Or( | ||||||
|             true, |             true, | ||||||
| @@ -1280,9 +1261,7 @@ mod test { | |||||||
|     #[test] |     #[test] | ||||||
|     fn optional_word_phrase() { |     fn optional_word_phrase() { | ||||||
|         let query = "\"hey my\""; |         let query = "\"hey my\""; | ||||||
|         let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default()); |         let tokens = query.tokenize(); | ||||||
|         let result = analyzer.analyze(query); |  | ||||||
|         let tokens = result.tokens(); |  | ||||||
|  |  | ||||||
|         let expected = Operation::Phrase(vec!["hey".to_string(), "my".to_string()]); |         let expected = Operation::Phrase(vec!["hey".to_string(), "my".to_string()]); | ||||||
|         let (query_tree, _) = |         let (query_tree, _) = | ||||||
| @@ -1294,9 +1273,7 @@ mod test { | |||||||
|     #[test] |     #[test] | ||||||
|     fn optional_word_multiple_phrases() { |     fn optional_word_multiple_phrases() { | ||||||
|         let query = r#""hey" my good "friend""#; |         let query = r#""hey" my good "friend""#; | ||||||
|         let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default()); |         let tokens = query.tokenize(); | ||||||
|         let result = analyzer.analyze(query); |  | ||||||
|         let tokens = result.tokens(); |  | ||||||
|  |  | ||||||
|         let expected = Operation::Or( |         let expected = Operation::Or( | ||||||
|             true, |             true, | ||||||
| @@ -1365,9 +1342,7 @@ mod test { | |||||||
|     #[test] |     #[test] | ||||||
|     fn no_typo() { |     fn no_typo() { | ||||||
|         let query = "hey friends "; |         let query = "hey friends "; | ||||||
|         let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default()); |         let tokens = query.tokenize(); | ||||||
|         let result = analyzer.analyze(query); |  | ||||||
|         let tokens = result.tokens(); |  | ||||||
|  |  | ||||||
|         let expected = Operation::Or( |         let expected = Operation::Or( | ||||||
|             false, |             false, | ||||||
| @@ -1397,9 +1372,7 @@ mod test { | |||||||
|     #[test] |     #[test] | ||||||
|     fn words_limit() { |     fn words_limit() { | ||||||
|         let query = "\"hey my\" good friend"; |         let query = "\"hey my\" good friend"; | ||||||
|         let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default()); |         let tokens = query.tokenize(); | ||||||
|         let result = analyzer.analyze(query); |  | ||||||
|         let tokens = result.tokens(); |  | ||||||
|  |  | ||||||
|         let expected = Operation::And(vec![ |         let expected = Operation::And(vec![ | ||||||
|             Operation::Phrase(vec!["hey".to_string(), "my".to_string()]), |             Operation::Phrase(vec!["hey".to_string(), "my".to_string()]), | ||||||
| @@ -1441,10 +1414,8 @@ mod test { | |||||||
|     #[test] |     #[test] | ||||||
|     fn disable_typo_on_word() { |     fn disable_typo_on_word() { | ||||||
|         let query = "goodbye"; |         let query = "goodbye"; | ||||||
|         let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default()); |         let tokens = query.tokenize(); | ||||||
|         let result = analyzer.analyze(query); |  | ||||||
|  |  | ||||||
|         let tokens = result.tokens(); |  | ||||||
|         let exact_words = fst::Set::from_iter(Some("goodbye")).unwrap().into_fst().into_inner(); |         let exact_words = fst::Set::from_iter(Some("goodbye")).unwrap().into_fst().into_inner(); | ||||||
|         let exact_words = Some(fst::Set::new(exact_words).unwrap().map_data(Cow::Owned).unwrap()); |         let exact_words = Some(fst::Set::new(exact_words).unwrap().map_data(Cow::Owned).unwrap()); | ||||||
|         let context = TestContext { exact_words, ..Default::default() }; |         let context = TestContext { exact_words, ..Default::default() }; | ||||||
|   | |||||||
| @@ -3,8 +3,7 @@ use std::convert::TryInto; | |||||||
| use std::fs::File; | use std::fs::File; | ||||||
| use std::{io, mem, str}; | use std::{io, mem, str}; | ||||||
|  |  | ||||||
| use meilisearch_tokenizer::token::SeparatorKind; | use charabia::{SeparatorKind, Token, TokenKind, TokenizerBuilder}; | ||||||
| use meilisearch_tokenizer::{Analyzer, AnalyzerConfig, Token, TokenKind}; |  | ||||||
| use roaring::RoaringBitmap; | use roaring::RoaringBitmap; | ||||||
| use serde_json::Value; | use serde_json::Value; | ||||||
|  |  | ||||||
| @@ -40,11 +39,11 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>( | |||||||
|  |  | ||||||
|     let mut key_buffer = Vec::new(); |     let mut key_buffer = Vec::new(); | ||||||
|     let mut field_buffer = String::new(); |     let mut field_buffer = String::new(); | ||||||
|     let mut config = AnalyzerConfig::default(); |     let mut builder = TokenizerBuilder::new(); | ||||||
|     if let Some(stop_words) = stop_words { |     if let Some(stop_words) = stop_words { | ||||||
|         config.stop_words(stop_words); |         builder.stop_words(stop_words); | ||||||
|     } |     } | ||||||
|     let analyzer = Analyzer::<Vec<u8>>::new(AnalyzerConfig::default()); |     let tokenizer = builder.build(); | ||||||
|  |  | ||||||
|     let mut cursor = obkv_documents.into_cursor()?; |     let mut cursor = obkv_documents.into_cursor()?; | ||||||
|     while let Some((key, value)) = cursor.move_on_next()? { |     while let Some((key, value)) = cursor.move_on_next()? { | ||||||
| @@ -64,12 +63,11 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>( | |||||||
|                     serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)?; |                     serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)?; | ||||||
|                 field_buffer.clear(); |                 field_buffer.clear(); | ||||||
|                 if let Some(field) = json_to_string(&value, &mut field_buffer) { |                 if let Some(field) = json_to_string(&value, &mut field_buffer) { | ||||||
|                     let analyzed = analyzer.analyze(field); |                     let tokens = process_tokens(tokenizer.tokenize(field)) | ||||||
|                     let tokens = process_tokens(analyzed.tokens()) |  | ||||||
|                         .take_while(|(p, _)| (*p as u32) < max_positions_per_attributes); |                         .take_while(|(p, _)| (*p as u32) < max_positions_per_attributes); | ||||||
|  |  | ||||||
|                     for (index, token) in tokens { |                     for (index, token) in tokens { | ||||||
|                         let token = token.text().trim(); |                         let token = token.lemma().trim(); | ||||||
|                         if !token.is_empty() { |                         if !token.is_empty() { | ||||||
|                             key_buffer.truncate(mem::size_of::<u32>()); |                             key_buffer.truncate(mem::size_of::<u32>()); | ||||||
|                             key_buffer.extend_from_slice(token.as_bytes()); |                             key_buffer.extend_from_slice(token.as_bytes()); | ||||||
| @@ -146,7 +144,7 @@ fn process_tokens<'a>( | |||||||
|     tokens: impl Iterator<Item = Token<'a>>, |     tokens: impl Iterator<Item = Token<'a>>, | ||||||
| ) -> impl Iterator<Item = (usize, Token<'a>)> { | ) -> impl Iterator<Item = (usize, Token<'a>)> { | ||||||
|     tokens |     tokens | ||||||
|         .skip_while(|token| token.is_separator().is_some()) |         .skip_while(|token| token.is_separator()) | ||||||
|         .scan((0, None), |(offset, prev_kind), token| { |         .scan((0, None), |(offset, prev_kind), token| { | ||||||
|             match token.kind { |             match token.kind { | ||||||
|                 TokenKind::Word | TokenKind::StopWord | TokenKind::Unknown => { |                 TokenKind::Word | TokenKind::StopWord | TokenKind::Unknown => { | ||||||
|   | |||||||
| @@ -1,8 +1,8 @@ | |||||||
| use std::collections::{BTreeSet, HashMap, HashSet}; | use std::collections::{BTreeSet, HashMap, HashSet}; | ||||||
| use std::result::Result as StdResult; | use std::result::Result as StdResult; | ||||||
|  |  | ||||||
|  | use charabia::{Tokenizer, TokenizerBuilder}; | ||||||
| use itertools::Itertools; | use itertools::Itertools; | ||||||
| use meilisearch_tokenizer::{Analyzer, AnalyzerConfig}; |  | ||||||
| use serde::{Deserialize, Deserializer, Serialize, Serializer}; | use serde::{Deserialize, Deserializer, Serialize, Serializer}; | ||||||
| use time::OffsetDateTime; | use time::OffsetDateTime; | ||||||
|  |  | ||||||
| @@ -385,13 +385,12 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { | |||||||
|     fn update_synonyms(&mut self) -> Result<bool> { |     fn update_synonyms(&mut self) -> Result<bool> { | ||||||
|         match self.synonyms { |         match self.synonyms { | ||||||
|             Setting::Set(ref synonyms) => { |             Setting::Set(ref synonyms) => { | ||||||
|                 fn normalize(analyzer: &Analyzer<&[u8]>, text: &str) -> Vec<String> { |                 fn normalize(tokenizer: &Tokenizer<&[u8]>, text: &str) -> Vec<String> { | ||||||
|                     analyzer |                     tokenizer | ||||||
|                         .analyze(text) |                         .tokenize(text) | ||||||
|                         .tokens() |  | ||||||
|                         .filter_map(|token| { |                         .filter_map(|token| { | ||||||
|                             if token.is_word() { |                             if token.is_word() { | ||||||
|                                 Some(token.text().to_string()) |                                 Some(token.lemma().to_string()) | ||||||
|                             } else { |                             } else { | ||||||
|                                 None |                                 None | ||||||
|                             } |                             } | ||||||
| @@ -399,19 +398,19 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { | |||||||
|                         .collect::<Vec<_>>() |                         .collect::<Vec<_>>() | ||||||
|                 } |                 } | ||||||
|  |  | ||||||
|                 let mut config = AnalyzerConfig::default(); |                 let mut builder = TokenizerBuilder::new(); | ||||||
|                 let stop_words = self.index.stop_words(self.wtxn)?; |                 let stop_words = self.index.stop_words(self.wtxn)?; | ||||||
|                 if let Some(stop_words) = &stop_words { |                 if let Some(ref stop_words) = stop_words { | ||||||
|                     config.stop_words(stop_words); |                     builder.stop_words(stop_words); | ||||||
|                 } |                 } | ||||||
|                 let analyzer = Analyzer::new(config); |                 let tokenizer = builder.build(); | ||||||
|  |  | ||||||
|                 let mut new_synonyms = HashMap::new(); |                 let mut new_synonyms = HashMap::new(); | ||||||
|                 for (word, synonyms) in synonyms { |                 for (word, synonyms) in synonyms { | ||||||
|                     // Normalize both the word and associated synonyms. |                     // Normalize both the word and associated synonyms. | ||||||
|                     let normalized_word = normalize(&analyzer, word); |                     let normalized_word = normalize(&tokenizer, word); | ||||||
|                     let normalized_synonyms = |                     let normalized_synonyms = | ||||||
|                         synonyms.iter().map(|synonym| normalize(&analyzer, synonym)); |                         synonyms.iter().map(|synonym| normalize(&tokenizer, synonym)); | ||||||
|  |  | ||||||
|                     // Store the normalized synonyms under the normalized word, |                     // Store the normalized synonyms under the normalized word, | ||||||
|                     // merging the possible duplicate words. |                     // merging the possible duplicate words. | ||||||
| @@ -584,19 +583,19 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { | |||||||
|     fn update_exact_words(&mut self) -> Result<()> { |     fn update_exact_words(&mut self) -> Result<()> { | ||||||
|         match self.exact_words { |         match self.exact_words { | ||||||
|             Setting::Set(ref mut words) => { |             Setting::Set(ref mut words) => { | ||||||
|                 fn normalize(analyzer: &Analyzer<&[u8]>, text: &str) -> String { |                 fn normalize(tokenizer: &Tokenizer<&[u8]>, text: &str) -> String { | ||||||
|                     analyzer.analyze(text).tokens().map(|token| token.text().to_string()).collect() |                     tokenizer.tokenize(text).map(|token| token.lemma().to_string()).collect() | ||||||
|                 } |                 } | ||||||
|  |  | ||||||
|                 let mut config = AnalyzerConfig::default(); |                 let mut builder = TokenizerBuilder::new(); | ||||||
|                 let stop_words = self.index.stop_words(self.wtxn)?; |                 let stop_words = self.index.stop_words(self.wtxn)?; | ||||||
|                 if let Some(stop_words) = &stop_words { |                 if let Some(ref stop_words) = stop_words { | ||||||
|                     config.stop_words(stop_words); |                     builder.stop_words(stop_words); | ||||||
|                 } |                 } | ||||||
|                 let analyzer = Analyzer::new(config); |                 let tokenizer = builder.build(); | ||||||
|  |  | ||||||
|                 let mut words: Vec<_> = |                 let mut words: Vec<_> = | ||||||
|                     words.iter().map(|word| normalize(&analyzer, word)).collect(); |                     words.iter().map(|word| normalize(&tokenizer, word)).collect(); | ||||||
|  |  | ||||||
|                 // normalization could reorder words |                 // normalization could reorder words | ||||||
|                 words.sort_unstable(); |                 words.sort_unstable(); | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user