Mirror of https://github.com/meilisearch/meilisearch.git (synced 2025-07-28 09:11:00 +00:00)
Use Charabia in milli
@@ -3,8 +3,7 @@ use std::convert::TryInto;
 use std::fs::File;
 use std::{io, mem, str};

-use meilisearch_tokenizer::token::SeparatorKind;
-use meilisearch_tokenizer::{Analyzer, AnalyzerConfig, Token, TokenKind};
+use charabia::{SeparatorKind, Token, TokenKind, TokenizerBuilder};
 use roaring::RoaringBitmap;
 use serde_json::Value;

@@ -40,11 +39,11 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(

     let mut key_buffer = Vec::new();
     let mut field_buffer = String::new();
-    let mut config = AnalyzerConfig::default();
+    let mut builder = TokenizerBuilder::new();
     if let Some(stop_words) = stop_words {
-        config.stop_words(stop_words);
+        builder.stop_words(stop_words);
     }
-    let analyzer = Analyzer::<Vec<u8>>::new(AnalyzerConfig::default());
+    let tokenizer = builder.build();

     let mut cursor = obkv_documents.into_cursor()?;
     while let Some((key, value)) = cursor.move_on_next()? {
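For readers unfamiliar with charabia, here is a standalone sketch of the builder pattern the hunk above switches to: a `TokenizerBuilder` optionally fed an `fst::Set` of stop words, then turned into a `Tokenizer`. It relies only on calls visible in the diff (`TokenizerBuilder::new`, `stop_words`, `build`, `tokenize`, `lemma`, `token.kind`); the `fst::Set::from_iter` construction, the sample word list, and the `tokenize_demo` name are illustrative assumptions, not code from this commit.

```rust
use charabia::{TokenKind, TokenizerBuilder};
use fst::Set;

fn tokenize_demo() -> Result<(), fst::Error> {
    // Stop words are handed to the builder as an fst::Set (assumed here to be
    // built with `Set::from_iter`, which expects a lexicographically sorted list).
    let stop_words = Set::from_iter(["a", "of", "the"])?;

    let mut builder = TokenizerBuilder::new();
    builder.stop_words(&stop_words);
    let tokenizer = builder.build();

    for token in tokenizer.tokenize("The quick brown fox") {
        match token.kind {
            TokenKind::StopWord => println!("stop word: {}", token.lemma()),
            TokenKind::Word | TokenKind::Unknown => println!("word: {}", token.lemma()),
            _ => {} // separators and any other kinds are skipped
        }
    }
    Ok(())
}
```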
@@ -64,12 +63,11 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
                     serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)?;
                 field_buffer.clear();
                 if let Some(field) = json_to_string(&value, &mut field_buffer) {
-                    let analyzed = analyzer.analyze(field);
-                    let tokens = process_tokens(analyzed.tokens())
+                    let tokens = process_tokens(tokenizer.tokenize(field))
                         .take_while(|(p, _)| (*p as u32) < max_positions_per_attributes);

                     for (index, token) in tokens {
-                        let token = token.text().trim();
+                        let token = token.lemma().trim();
                         if !token.is_empty() {
                             key_buffer.truncate(mem::size_of::<u32>());
                             key_buffer.extend_from_slice(token.as_bytes());
@@ -146,7 +144,7 @@ fn process_tokens<'a>(
     tokens: impl Iterator<Item = Token<'a>>,
 ) -> impl Iterator<Item = (usize, Token<'a>)> {
     tokens
-        .skip_while(|token| token.is_separator().is_some())
+        .skip_while(|token| token.is_separator())
         .scan((0, None), |(offset, prev_kind), token| {
             match token.kind {
                 TokenKind::Word | TokenKind::StopWord | TokenKind::Unknown => {
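The two hunks above replace the old `analyzer.analyze(field)` / `token.text()` pair with `tokenizer.tokenize(field)` / `token.lemma()`, and `is_separator()` now returns a plain bool instead of an `Option`. Below is a minimal sketch of that per-field flow, assuming only the charabia calls shown in the diff; the `collect_words` helper and the explicit `Vec<u8>` stop-word type parameter are illustrative, not milli code.

```rust
use charabia::TokenizerBuilder;

// Illustrative only: returns the trimmed, non-empty lemmas of a field's tokens,
// skipping separators, roughly what the extractor loop above does per document field.
fn collect_words(field: &str) -> Vec<String> {
    // No stop words are set here, so the builder's stop-word storage type is pinned explicitly.
    let mut builder = TokenizerBuilder::<Vec<u8>>::new();
    let tokenizer = builder.build();

    tokenizer
        .tokenize(field)
        .filter(|token| !token.is_separator())
        .map(|token| token.lemma().trim().to_string())
        .filter(|lemma| !lemma.is_empty())
        .collect()
}
```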
@@ -1,8 +1,8 @@
 use std::collections::{BTreeSet, HashMap, HashSet};
 use std::result::Result as StdResult;

+use charabia::{Tokenizer, TokenizerBuilder};
 use itertools::Itertools;
-use meilisearch_tokenizer::{Analyzer, AnalyzerConfig};
 use serde::{Deserialize, Deserializer, Serialize, Serializer};
 use time::OffsetDateTime;

@@ -385,13 +385,12 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
     fn update_synonyms(&mut self) -> Result<bool> {
         match self.synonyms {
             Setting::Set(ref synonyms) => {
-                fn normalize(analyzer: &Analyzer<&[u8]>, text: &str) -> Vec<String> {
-                    analyzer
-                        .analyze(text)
-                        .tokens()
+                fn normalize(tokenizer: &Tokenizer<&[u8]>, text: &str) -> Vec<String> {
+                    tokenizer
+                        .tokenize(text)
                         .filter_map(|token| {
                             if token.is_word() {
-                                Some(token.text().to_string())
+                                Some(token.lemma().to_string())
                             } else {
                                 None
                             }
@@ -399,19 +398,19 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
                         .collect::<Vec<_>>()
                 }

-                let mut config = AnalyzerConfig::default();
+                let mut builder = TokenizerBuilder::new();
                 let stop_words = self.index.stop_words(self.wtxn)?;
-                if let Some(stop_words) = &stop_words {
-                    config.stop_words(stop_words);
+                if let Some(ref stop_words) = stop_words {
+                    builder.stop_words(stop_words);
                 }
-                let analyzer = Analyzer::new(config);
+                let tokenizer = builder.build();

                 let mut new_synonyms = HashMap::new();
                 for (word, synonyms) in synonyms {
                     // Normalize both the word and associated synonyms.
-                    let normalized_word = normalize(&analyzer, word);
+                    let normalized_word = normalize(&tokenizer, word);
                     let normalized_synonyms =
-                        synonyms.iter().map(|synonym| normalize(&analyzer, synonym));
+                        synonyms.iter().map(|synonym| normalize(&tokenizer, synonym));

                     // Store the normalized synonyms under the normalized word,
                     // merging the possible duplicate words.
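The `normalize` helper above keeps only the lemmas of word tokens, and the loop merges synonyms whose keys normalize to the same word list. The following hedged, self-contained sketch shows that pattern; only `normalize` is taken verbatim from the hunk, while the `merge_normalized_synonyms` name and map types are illustrative rather than milli's actual types.

```rust
use std::collections::HashMap;

use charabia::Tokenizer;

// The normalize helper introduced by the hunk above, reproduced for a self-contained example.
fn normalize(tokenizer: &Tokenizer<&[u8]>, text: &str) -> Vec<String> {
    tokenizer
        .tokenize(text)
        .filter_map(|token| {
            if token.is_word() {
                Some(token.lemma().to_string())
            } else {
                None
            }
        })
        .collect()
}

// Illustrative merge step: synonyms whose keys normalize to the same word list
// end up under a single entry, mirroring the comment in the hunk above.
fn merge_normalized_synonyms(
    tokenizer: &Tokenizer<&[u8]>,
    synonyms: &HashMap<String, Vec<String>>,
) -> HashMap<Vec<String>, Vec<Vec<String>>> {
    let mut merged: HashMap<Vec<String>, Vec<Vec<String>>> = HashMap::new();
    for (word, entries) in synonyms {
        let normalized_word = normalize(tokenizer, word);
        let normalized_entries = entries.iter().map(|synonym| normalize(tokenizer, synonym));
        merged.entry(normalized_word).or_default().extend(normalized_entries);
    }
    merged
}
```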
@@ -584,19 +583,19 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
     fn update_exact_words(&mut self) -> Result<()> {
         match self.exact_words {
             Setting::Set(ref mut words) => {
-                fn normalize(analyzer: &Analyzer<&[u8]>, text: &str) -> String {
-                    analyzer.analyze(text).tokens().map(|token| token.text().to_string()).collect()
+                fn normalize(tokenizer: &Tokenizer<&[u8]>, text: &str) -> String {
+                    tokenizer.tokenize(text).map(|token| token.lemma().to_string()).collect()
                 }

-                let mut config = AnalyzerConfig::default();
+                let mut builder = TokenizerBuilder::new();
                 let stop_words = self.index.stop_words(self.wtxn)?;
-                if let Some(stop_words) = &stop_words {
-                    config.stop_words(stop_words);
+                if let Some(ref stop_words) = stop_words {
+                    builder.stop_words(stop_words);
                 }
-                let analyzer = Analyzer::new(config);
+                let tokenizer = builder.build();

                 let mut words: Vec<_> =
-                    words.iter().map(|word| normalize(&analyzer, word)).collect();
+                    words.iter().map(|word| normalize(&tokenizer, word)).collect();

                 // normalization could reorder words
                 words.sort_unstable();
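For exact words the helper collects all lemmas into a single `String`, and the list is re-sorted afterwards because normalization may change the original ordering. A short sketch under those assumptions; the `normalized_exact_words` wrapper is illustrative, not milli code.

```rust
use charabia::Tokenizer;

// The String-returning normalize helper from the hunk above, reproduced for a self-contained example.
fn normalize(tokenizer: &Tokenizer<&[u8]>, text: &str) -> String {
    tokenizer.tokenize(text).map(|token| token.lemma().to_string()).collect()
}

// Illustrative wrapper: normalize every exact word, then re-sort, because
// normalization may change the lexicographic order of the original list.
fn normalized_exact_words(tokenizer: &Tokenizer<&[u8]>, words: &[String]) -> Vec<String> {
    let mut words: Vec<_> = words.iter().map(|word| normalize(tokenizer, word)).collect();
    words.sort_unstable();
    words
}
```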