Switch back to the old tokenizer

This commit is contained in:
Clément Renault
2020-08-30 21:50:30 +02:00
committed by Kerollmops
parent 220ba0785c
commit bad0663138
8 changed files with 45 additions and 101 deletions

21
src/tokenizer.rs Normal file
View File

@ -0,0 +1,21 @@
use slice_group_by::StrGroupBy;
/// The category of a token produced by `simple_tokenizer`.
///
/// Every token is either a run of alphanumeric characters (`Word`) or a run
/// of non-alphanumeric characters (`Space`) — note that `Space` covers any
/// separator run (punctuation included), not just whitespace.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TokenType {
// A maximal run of alphanumeric characters.
Word,
// A maximal run of non-alphanumeric characters (whitespace, punctuation, …).
Space,
}
/// Splits `text` into maximal runs of alphanumeric vs. non-alphanumeric
/// characters, tagging each run as `Word` or `Space`.
///
/// The returned slices borrow from `text` and, concatenated in order,
/// reconstruct it exactly. An empty input yields an empty iterator.
pub fn simple_tokenizer(text: &str) -> impl Iterator<Item=(TokenType, &str)> {
    // Remaining unconsumed tail of the input; shrinks as tokens are emitted.
    let mut rest = text;
    std::iter::from_fn(move || {
        // Peek at the first char to learn which group this run belongs to.
        let first = rest.chars().next()?;
        let is_word = first.is_alphanumeric();
        // The run ends at the first char whose class differs from `first`'s,
        // or at the end of the remaining input.
        let end = rest
            .char_indices()
            .find(|&(_, c)| c.is_alphanumeric() != is_word)
            .map_or(rest.len(), |(idx, _)| idx);
        let (token, tail) = rest.split_at(end);
        rest = tail;
        let ty = if is_word { TokenType::Word } else { TokenType::Space };
        Some((ty, token))
    })
}
/// Predicate selecting only `Word` tokens from a `(TokenType, &str)` pair.
///
/// Designed to be passed to `Iterator::filter` on the stream produced by
/// `simple_tokenizer`.
pub fn only_words((t, _): &(TokenType, &str)) -> bool {
    matches!(t, TokenType::Word)
}