Mirror of https://github.com/meilisearch/meilisearch.git
Come back to the old tokenizer
Committed by: Kerollmops
Parent: 220ba0785c
Commit: bad0663138
src/tokenizer.rs (new file, 21 lines)
@@ -0,0 +1,21 @@
use slice_group_by::StrGroupBy;

#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TokenType {
    Word,
    Space,
}

pub fn simple_tokenizer(text: &str) -> impl Iterator<Item=(TokenType, &str)> {
    text
        // Group consecutive chars that share the same alphanumeric-ness.
        .linear_group_by_key(|c| c.is_alphanumeric())
        .map(|s| {
            // The key is uniform across a group, so the first char decides the token type.
            let first = s.chars().next().unwrap();
            let type_ = if first.is_alphanumeric() { TokenType::Word } else { TokenType::Space };
            (type_, s)
        })
}

// Predicate shaped for Iterator::filter over the tokenizer's items.
pub fn only_words((t, _): &(TokenType, &str)) -> bool {
    *t == TokenType::Word
}
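For context, here is a minimal usage sketch of the API this file adds. It is not part of the commit; the crate::tokenizer module path and the demo input are my assumptions.

use crate::tokenizer::{simple_tokenizer, only_words, TokenType};

fn demo() {
    // is_alphanumeric flips between "hello", " ", "world" and "!",
    // so linear_group_by_key yields four contiguous groups.
    let tokens: Vec<_> = simple_tokenizer("hello world!").collect();
    assert_eq!(tokens, vec![
        (TokenType::Word, "hello"),
        (TokenType::Space, " "),
        (TokenType::Word, "world"),
        (TokenType::Space, "!"),
    ]);

    // only_words takes its argument by reference, so it plugs straight
    // into Iterator::filter without a wrapping closure.
    let words: Vec<_> = simple_tokenizer("hello world!")
        .filter(only_words)
        .map(|(_, word)| word)
        .collect();
    assert_eq!(words, vec!["hello", "world"]);
}

Note that punctuation lands in the Space variant as well: this tokenizer only distinguishes alphanumeric runs from everything else.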