3866: Update charabia v0.8.0 r=dureuill a=ManyTheFish

# Pull Request

Update Charabia:
- enhance Japanese segmentation
- enhance Latin Tokenization
  - words containing `_` are now properly segmented into several words
  - brackets `{([])}` are no longer considered context separators, so words separated by brackets are now considered close together for the proximity ranking rule
- fixes #3815
- fixes #3778
- fixes [product#151](https://github.com/meilisearch/product/discussions/151)

> Important note: float numbers are now segmented around the `.`, so `3.22` is segmented as [`3`, `.`, `22`]. However, the middle dot isn't considered a hard separator, which means that a search for `3.22` finds documents containing `3.22`.

Co-authored-by: ManyTheFish <many@meilisearch.com>
This commit is contained in:
meili-bors[bot]
2023-06-29 15:24:36 +00:00
committed by GitHub
14 changed files with 252 additions and 187 deletions

View File

@ -256,7 +256,8 @@ pub(crate) mod tests {
let temp_index = temp_index_with_documents();
let rtxn = temp_index.read_txn().unwrap();
let mut ctx = SearchContext::new(&temp_index, &rtxn);
let tokenizer = TokenizerBuilder::new().build();
let mut builder = TokenizerBuilder::default();
let tokenizer = builder.build();
let tokens = tokenizer.tokenize("split this world");
let query_terms = located_query_terms_from_tokens(&mut ctx, tokens, None).unwrap();
let matching_words = MatchingWords::new(ctx, query_terms);

View File

@ -12,16 +12,16 @@ const DEFAULT_HIGHLIGHT_PREFIX: &str = "<em>";
const DEFAULT_HIGHLIGHT_SUFFIX: &str = "</em>";
/// Structure used to build a Matcher, allowing customization of the formatting tags.
pub struct MatcherBuilder<'a, A> {
pub struct MatcherBuilder<'m> {
matching_words: MatchingWords,
tokenizer: Tokenizer<'a, 'a, A>,
tokenizer: Tokenizer<'m>,
crop_marker: Option<String>,
highlight_prefix: Option<String>,
highlight_suffix: Option<String>,
}
impl<'a, A> MatcherBuilder<'a, A> {
pub fn new(matching_words: MatchingWords, tokenizer: Tokenizer<'a, 'a, A>) -> Self {
impl<'m> MatcherBuilder<'m> {
pub fn new(matching_words: MatchingWords, tokenizer: Tokenizer<'m>) -> Self {
Self {
matching_words,
tokenizer,
@ -46,7 +46,7 @@ impl<'a, A> MatcherBuilder<'a, A> {
self
}
pub fn build<'t, 'm>(&'m self, text: &'t str) -> Matcher<'t, 'm, A> {
pub fn build<'t>(&'m self, text: &'t str) -> Matcher<'t, 'm> {
let crop_marker = match &self.crop_marker {
Some(marker) => marker.as_str(),
None => DEFAULT_CROP_MARKER,
@ -103,17 +103,17 @@ pub struct MatchBounds {
/// Structure used to analyze a string, compute words that match,
/// and format the source string, returning a highlighted and cropped sub-string.
pub struct Matcher<'t, 'm, A> {
pub struct Matcher<'t, 'm> {
text: &'t str,
matching_words: &'m MatchingWords,
tokenizer: &'m Tokenizer<'m, 'm, A>,
tokenizer: &'m Tokenizer<'m>,
crop_marker: &'m str,
highlight_prefix: &'m str,
highlight_suffix: &'m str,
matches: Option<(Vec<Token<'t>>, Vec<Match>)>,
}
impl<'t, A: AsRef<[u8]>> Matcher<'t, '_, A> {
impl<'t> Matcher<'t, '_> {
/// Iterates over tokens and save any of them that matches the query.
fn compute_matches(&mut self) -> &mut Self {
/// some words are counted as matches only if they are close together and in the good order,
@ -503,7 +503,7 @@ mod tests {
use crate::index::tests::TempIndex;
use crate::{execute_search, SearchContext};
impl<'a> MatcherBuilder<'a, &[u8]> {
impl<'a> MatcherBuilder<'a> {
fn new_test(rtxn: &'a heed::RoTxn, index: &'a TempIndex, query: &str) -> Self {
let mut ctx = SearchContext::new(index, rtxn);
let crate::search::PartialSearchResult { located_query_terms, .. } = execute_search(
@ -530,7 +530,7 @@ mod tests {
None => MatchingWords::default(),
};
MatcherBuilder::new(matching_words, TokenizerBuilder::new().build())
MatcherBuilder::new(matching_words, TokenizerBuilder::default().into_tokenizer())
}
}
@ -690,7 +690,7 @@ mod tests {
// should crop the phrase instead of cropping around the match.
insta::assert_snapshot!(
matcher.format(format_options),
@" Split The World is a book written by Emily Henry…"
@"…Split The World is a book written by Emily Henry…"
);
// Text containing some matches.

View File

@ -7,7 +7,7 @@ use crate::{Result, SearchContext, MAX_WORD_LENGTH};
/// Convert the tokenised search query into a list of located query terms.
pub fn located_query_terms_from_tokens(
ctx: &mut SearchContext,
query: NormalizedTokenIter<&[u8]>,
query: NormalizedTokenIter,
words_limit: Option<usize>,
) -> Result<Vec<LocatedQueryTerm>> {
let nbr_typos = number_of_typos_allowed(ctx)?;
@ -303,7 +303,8 @@ mod tests {
#[test]
fn start_with_hard_separator() -> Result<()> {
let tokenizer = TokenizerBuilder::new().build();
let mut builder = TokenizerBuilder::default();
let tokenizer = builder.build();
let tokens = tokenizer.tokenize(".");
let index = temp_index_with_documents();
let rtxn = index.read_txn()?;