mirror of
https://github.com/meilisearch/meilisearch.git
synced 2025-07-26 00:01:00 +00:00
feat: Move query splitting into the tokenizer workspace
This commit is contained in:
@ -5,4 +5,4 @@ authors = ["Clément Renault <renault.cle@gmail.com>"]
|
||||
edition = "2018"
|
||||
|
||||
[dependencies]
|
||||
|
||||
slice-group-by = "0.2.4"
|
||||
|
@ -1,4 +1,5 @@
|
||||
use std::mem;
|
||||
use slice_group_by::LinearStrGroupBy;
|
||||
use self::Separator::*;
|
||||
|
||||
pub fn is_cjk(c: char) -> bool {
|
||||
@ -13,6 +14,33 @@ pub fn is_cjk(c: char) -> bool {
|
||||
(c >= '\u{f900}' && c <= '\u{faff}')
|
||||
}
|
||||
|
||||
/// Coarse classification of a character, used to decide where a query
/// string is cut into groups.
///
/// The enum is fieldless, so it is `Copy`: values can be compared and passed
/// around without being moved out of their bindings.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum CharCategory {
    /// A whitespace character (as reported by `char::is_whitespace`).
    Space,
    /// A character for which `is_cjk` returns `true`.
    Cjk,
    /// Any character that is neither whitespace nor CJK.
    Other,
}
|
||||
|
||||
fn classify_char(c: char) -> CharCategory {
|
||||
if c.is_whitespace() { CharCategory::Space }
|
||||
else if is_cjk(c) { CharCategory::Cjk }
|
||||
else { CharCategory::Other }
|
||||
}
|
||||
|
||||
/// Returns `true` when the slice contains no whitespace character at all.
///
/// The `&&str` parameter matches what `Iterator::filter` hands to its
/// predicate when iterating over `&str` items. An empty slice counts as a
/// word because it holds no whitespace.
fn is_word(s: &&str) -> bool {
    s.chars().all(|ch| !ch.is_whitespace())
}
|
||||
|
||||
fn same_group_category(a: char, b: char) -> bool {
|
||||
let ca = classify_char(a);
|
||||
let cb = classify_char(b);
|
||||
if ca == CharCategory::Cjk || cb == CharCategory::Cjk { false } else { ca == cb }
|
||||
}
|
||||
|
||||
pub fn split_query_string(query: &str) -> impl Iterator<Item=&str> {
|
||||
LinearStrGroupBy::new(query, same_group_category).filter(is_word)
|
||||
}
|
||||
|
||||
pub trait TokenizerBuilder {
|
||||
fn build<'a>(&self, text: &'a str) -> Box<Iterator<Item=Token<'a>> + 'a>;
|
||||
}
|
||||
|
Reference in New Issue
Block a user