Mirror of https://github.com/meilisearch/meilisearch.git
Come back to the old tokenizer
Committed by: Kerollmops
Parent: 220ba0785c
Commit: bad0663138
src/tokenizer.rs (new file, 21 lines)
@@ -0,0 +1,21 @@
use slice_group_by::StrGroupBy;

#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TokenType {
    Word,
    Space,
}

pub fn simple_tokenizer(text: &str) -> impl Iterator<Item=(TokenType, &str)> {
    text
        // Group consecutive chars that share the same alphanumeric-ness.
        .linear_group_by_key(|c| c.is_alphanumeric())
        .map(|s| {
            // The key is uniform across a group, so the first char decides the token type.
            let first = s.chars().next().unwrap();
            let type_ = if first.is_alphanumeric() { TokenType::Word } else { TokenType::Space };
            (type_, s)
        })
}

// Predicate shaped for Iterator::filter over the tokenizer's items.
pub fn only_words((t, _): &(TokenType, &str)) -> bool {
    *t == TokenType::Word
}
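For context, here is a minimal usage sketch of the API this file adds. It is not part of the commit; the crate::tokenizer module path and the demo input are my assumptions.

use crate::tokenizer::{simple_tokenizer, only_words, TokenType};

fn demo() {
    // is_alphanumeric flips between "hello", " ", "world" and "!",
    // so linear_group_by_key yields four contiguous groups.
    let tokens: Vec<_> = simple_tokenizer("hello world!").collect();
    assert_eq!(tokens, vec![
        (TokenType::Word, "hello"),
        (TokenType::Space, " "),
        (TokenType::Word, "world"),
        (TokenType::Space, "!"),
    ]);

    // only_words takes its argument by reference, so it plugs straight
    // into Iterator::filter without a wrapping closure.
    let words: Vec<_> = simple_tokenizer("hello world!")
        .filter(only_words)
        .map(|(_, word)| word)
        .collect();
    assert_eq!(words, vec!["hello", "world"]);
}

Note that punctuation lands in the Space variant as well: this tokenizer only distinguishes alphanumeric runs from everything else.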