Cargo fmt pass

2025-10-28 22:46:28 +00:00 · 2019-10-18 13:05:28 +02:00
parent 47d777c8f7
commit ca26a0f2e4
48 changed files with 1599 additions and 979 deletions
--- a/meilidb-tokenizer/src/lib.rs
+++ b/meilidb-tokenizer/src/lib.rs
@@ -1,17 +1,17 @@
-use std::iter::Peekable;
-use slice_group_by::StrGroupBy;
 use self::SeparatorCategory::*;
+use slice_group_by::StrGroupBy;
+use std::iter::Peekable;

 pub fn is_cjk(c: char) -> bool {
-    (c >= '\u{2e80}' && c <= '\u{2eff}') ||
-    (c >= '\u{2f00}' && c <= '\u{2fdf}') ||
-    (c >= '\u{3040}' && c <= '\u{309f}') ||
-    (c >= '\u{30a0}' && c <= '\u{30ff}') ||
-    (c >= '\u{3100}' && c <= '\u{312f}') ||
-    (c >= '\u{3200}' && c <= '\u{32ff}') ||
-    (c >= '\u{3400}' && c <= '\u{4dbf}') ||
-    (c >= '\u{4e00}' && c <= '\u{9fff}') ||
-    (c >= '\u{f900}' && c <= '\u{faff}')
+    (c >= '\u{2e80}' && c <= '\u{2eff}')
+        || (c >= '\u{2f00}' && c <= '\u{2fdf}')
+        || (c >= '\u{3040}' && c <= '\u{309f}')
+        || (c >= '\u{30a0}' && c <= '\u{30ff}')
+        || (c >= '\u{3100}' && c <= '\u{312f}')
+        || (c >= '\u{3200}' && c <= '\u{32ff}')
+        || (c >= '\u{3400}' && c <= '\u{4dbf}')
+        || (c >= '\u{4e00}' && c <= '\u{9fff}')
+        || (c >= '\u{f900}' && c <= '\u{faff}')
 }

 #[derive(Debug, Copy, Clone, PartialEq, Eq)]
@@ -22,7 +22,11 @@ enum SeparatorCategory {

 impl SeparatorCategory {
    fn merge(self, other: SeparatorCategory) -> SeparatorCategory {
-        if let (Soft, Soft) = (self, other) { Soft } else { Hard }
+        if let (Soft, Soft) = (self, other) {
+            Soft
+        } else {
+            Hard
+        }
    }

    fn to_usize(self) -> usize {
@@ -40,7 +44,7 @@ fn is_separator(c: char) -> bool {
 fn classify_separator(c: char) -> Option<SeparatorCategory> {
    match c {
        ' ' | '-' | '_' | '\'' | ':' | '"' => Some(Soft),
-        '.' | ';' | ',' | '!' | '?' |  '(' | ')' => Some(Hard),
+        '.' | ';' | ',' | '!' | '?' | '(' | ')' => Some(Hard),
        _ => None,
    }
 }
@@ -79,7 +83,7 @@ fn chars_count_index((n, _): (usize, usize), (i, c): (usize, char)) -> (usize, u
    (n + 1, i + c.len_utf8())
 }

-pub fn split_query_string(query: &str) -> impl Iterator<Item=&str> {
+pub fn split_query_string(query: &str) -> impl Iterator<Item = &str> {
    Tokenizer::new(query).map(|t| t.word)
 }

@@ -100,9 +104,10 @@ impl<'a> Tokenizer<'a> {
    pub fn new(string: &str) -> Tokenizer {
        // skip every separator and set `char_index`
        // to the number of char trimmed
-        let (count, index) = string.char_indices()
-                                   .take_while(|(_, c)| is_separator(*c))
-                                   .fold((0, 0), chars_count_index);
+        let (count, index) = string
+            .char_indices()
+            .take_while(|(_, c)| is_separator(*c))
+            .fold((0, 0), chars_count_index);

        Tokenizer {
            inner: &string[index..],
@@ -122,10 +127,11 @@ impl<'a> Iterator for Tokenizer<'a> {
            let (count, index) = string.char_indices().fold((0, 0), chars_count_index);

            if !is_str_word(string) {
-                self.word_index += string.chars()
-                                         .filter_map(classify_separator)
-                                         .fold(Soft, |a, x| a.merge(x))
-                                         .to_usize();
+                self.word_index += string
+                    .chars()
+                    .filter_map(classify_separator)
+                    .fold(Soft, |a, x| a.merge(x))
+                    .to_usize();
                self.char_index += count;
                self.inner = &self.inner[index..];
                continue;
@@ -153,7 +159,8 @@ impl<'a> Iterator for Tokenizer<'a> {
 }

 pub struct SeqTokenizer<'a, I>
-where I: Iterator<Item=&'a str>,
+where
+    I: Iterator<Item = &'a str>,
 {
    inner: I,
    current: Option<Peekable<Tokenizer<'a>>>,
@@ -162,7 +169,8 @@ where I: Iterator<Item=&'a str>,
 }

 impl<'a, I> SeqTokenizer<'a, I>
-where I: Iterator<Item=&'a str>,
+where
+    I: Iterator<Item = &'a str>,
 {
    pub fn new(mut iter: I) -> SeqTokenizer<'a, I> {
        let current = iter.next().map(|s| Tokenizer::new(s).peekable());
@@ -176,7 +184,8 @@ where I: Iterator<Item=&'a str>,
 }

 impl<'a, I> Iterator for SeqTokenizer<'a, I>
-where I: Iterator<Item=&'a str>,
+where
+    I: Iterator<Item = &'a str>,
 {
    type Item = Token<'a>;

@@ -202,15 +211,15 @@ where I: Iterator<Item=&'a str>,
                        }

                        Some(token)
-                    },
+                    }
                    None => {
                        // no more words in this text we must
                        // start tokenizing the next text
                        self.current = self.inner.next().map(|s| Tokenizer::new(s).peekable());
                        self.next()
-                    },
+                    }
                }
-            },
+            }
            // no more texts available
            None => None,
        }
@@ -225,12 +234,26 @@ mod tests {
    fn easy() {
        let mut tokenizer = Tokenizer::new("salut");

-        assert_eq!(tokenizer.next(), Some(Token { word: "salut", word_index: 0, char_index: 0 }));
+        assert_eq!(
+            tokenizer.next(),
+            Some(Token {
+                word: "salut",
+                word_index: 0,
+                char_index: 0
+            })
+        );
        assert_eq!(tokenizer.next(), None);

        let mut tokenizer = Tokenizer::new("yo    ");

-        assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 0 }));
+        assert_eq!(
+            tokenizer.next(),
+            Some(Token {
+                word: "yo",
+                word_index: 0,
+                char_index: 0
+            })
+        );
        assert_eq!(tokenizer.next(), None);
    }

@@ -238,19 +261,82 @@ mod tests {
    fn hard() {
        let mut tokenizer = Tokenizer::new(" .? yo lolo. aïe (ouch)");

-        assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 4 }));
-        assert_eq!(tokenizer.next(), Some(Token { word: "lolo", word_index: 1, char_index: 7 }));
-        assert_eq!(tokenizer.next(), Some(Token { word: "aïe", word_index: 9, char_index: 13 }));
-        assert_eq!(tokenizer.next(), Some(Token { word: "ouch", word_index: 17, char_index: 18 }));
+        assert_eq!(
+            tokenizer.next(),
+            Some(Token {
+                word: "yo",
+                word_index: 0,
+                char_index: 4
+            })
+        );
+        assert_eq!(
+            tokenizer.next(),
+            Some(Token {
+                word: "lolo",
+                word_index: 1,
+                char_index: 7
+            })
+        );
+        assert_eq!(
+            tokenizer.next(),
+            Some(Token {
+                word: "aïe",
+                word_index: 9,
+                char_index: 13
+            })
+        );
+        assert_eq!(
+            tokenizer.next(),
+            Some(Token {
+                word: "ouch",
+                word_index: 17,
+                char_index: 18
+            })
+        );
        assert_eq!(tokenizer.next(), None);

        let mut tokenizer = Tokenizer::new("yo ! lolo ? wtf - lol . aïe ,");

-        assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 0 }));
-        assert_eq!(tokenizer.next(), Some(Token { word: "lolo", word_index: 8, char_index: 5 }));
-        assert_eq!(tokenizer.next(), Some(Token { word: "wtf", word_index: 16, char_index: 12 }));
-        assert_eq!(tokenizer.next(), Some(Token { word: "lol", word_index: 17, char_index: 18 }));
-        assert_eq!(tokenizer.next(), Some(Token { word: "aïe", word_index: 25, char_index: 24 }));
+        assert_eq!(
+            tokenizer.next(),
+            Some(Token {
+                word: "yo",
+                word_index: 0,
+                char_index: 0
+            })
+        );
+        assert_eq!(
+            tokenizer.next(),
+            Some(Token {
+                word: "lolo",
+                word_index: 8,
+                char_index: 5
+            })
+        );
+        assert_eq!(
+            tokenizer.next(),
+            Some(Token {
+                word: "wtf",
+                word_index: 16,
+                char_index: 12
+            })
+        );
+        assert_eq!(
+            tokenizer.next(),
+            Some(Token {
+                word: "lol",
+                word_index: 17,
+                char_index: 18
+            })
+        );
+        assert_eq!(
+            tokenizer.next(),
+            Some(Token {
+                word: "aïe",
+                word_index: 25,
+                char_index: 24
+            })
+        );
        assert_eq!(tokenizer.next(), None);
    }

@@ -258,18 +344,74 @@ mod tests {
    fn hard_long_chars() {
        let mut tokenizer = Tokenizer::new(" .? yo 😂. aïe");

-        assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 4 }));
-        assert_eq!(tokenizer.next(), Some(Token { word: "😂", word_index: 1, char_index: 7 }));
-        assert_eq!(tokenizer.next(), Some(Token { word: "aïe", word_index: 9, char_index: 10 }));
+        assert_eq!(
+            tokenizer.next(),
+            Some(Token {
+                word: "yo",
+                word_index: 0,
+                char_index: 4
+            })
+        );
+        assert_eq!(
+            tokenizer.next(),
+            Some(Token {
+                word: "😂",
+                word_index: 1,
+                char_index: 7
+            })
+        );
+        assert_eq!(
+            tokenizer.next(),
+            Some(Token {
+                word: "aïe",
+                word_index: 9,
+                char_index: 10
+            })
+        );
        assert_eq!(tokenizer.next(), None);

        let mut tokenizer = Tokenizer::new("yo ! lolo ? 😱 - lol . 😣 ,");

-        assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 0 }));
-        assert_eq!(tokenizer.next(), Some(Token { word: "lolo", word_index: 8, char_index: 5 }));
-        assert_eq!(tokenizer.next(), Some(Token { word: "😱", word_index: 16, char_index: 12 }));
-        assert_eq!(tokenizer.next(), Some(Token { word: "lol", word_index: 17, char_index: 16 }));
-        assert_eq!(tokenizer.next(), Some(Token { word: "😣", word_index: 25, char_index: 22 }));
+        assert_eq!(
+            tokenizer.next(),
+            Some(Token {
+                word: "yo",
+                word_index: 0,
+                char_index: 0
+            })
+        );
+        assert_eq!(
+            tokenizer.next(),
+            Some(Token {
+                word: "lolo",
+                word_index: 8,
+                char_index: 5
+            })
+        );
+        assert_eq!(
+            tokenizer.next(),
+            Some(Token {
+                word: "😱",
+                word_index: 16,
+                char_index: 12
+            })
+        );
+        assert_eq!(
+            tokenizer.next(),
+            Some(Token {
+                word: "lol",
+                word_index: 17,
+                char_index: 16
+            })
+        );
+        assert_eq!(
+            tokenizer.next(),
+            Some(Token {
+                word: "😣",
+                word_index: 25,
+                char_index: 22
+            })
+        );
        assert_eq!(tokenizer.next(), None);
    }

@@ -277,19 +419,82 @@ mod tests {
    fn hard_kanjis() {
        let mut tokenizer = Tokenizer::new("\u{2ec4}lolilol\u{2ec7}");

-        assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ec4}", word_index: 0, char_index: 0 }));
-        assert_eq!(tokenizer.next(), Some(Token { word: "lolilol", word_index: 1, char_index: 1 }));
-        assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ec7}", word_index: 2, char_index: 8 }));
+        assert_eq!(
+            tokenizer.next(),
+            Some(Token {
+                word: "\u{2ec4}",
+                word_index: 0,
+                char_index: 0
+            })
+        );
+        assert_eq!(
+            tokenizer.next(),
+            Some(Token {
+                word: "lolilol",
+                word_index: 1,
+                char_index: 1
+            })
+        );
+        assert_eq!(
+            tokenizer.next(),
+            Some(Token {
+                word: "\u{2ec7}",
+                word_index: 2,
+                char_index: 8
+            })
+        );
        assert_eq!(tokenizer.next(), None);

        let mut tokenizer = Tokenizer::new("\u{2ec4}\u{2ed3}\u{2ef2} lolilol - hello    \u{2ec7}");

-        assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ec4}", word_index: 0, char_index: 0 }));
-        assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ed3}", word_index: 1, char_index: 1 }));
-        assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ef2}", word_index: 2, char_index: 2 }));
-        assert_eq!(tokenizer.next(), Some(Token { word: "lolilol", word_index: 3, char_index: 4 }));
-        assert_eq!(tokenizer.next(), Some(Token { word: "hello", word_index: 4, char_index: 14 }));
-        assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ec7}", word_index: 5, char_index: 23 }));
+        assert_eq!(
+            tokenizer.next(),
+            Some(Token {
+                word: "\u{2ec4}",
+                word_index: 0,
+                char_index: 0
+            })
+        );
+        assert_eq!(
+            tokenizer.next(),
+            Some(Token {
+                word: "\u{2ed3}",
+                word_index: 1,
+                char_index: 1
+            })
+        );
+        assert_eq!(
+            tokenizer.next(),
+            Some(Token {
+                word: "\u{2ef2}",
+                word_index: 2,
+                char_index: 2
+            })
+        );
+        assert_eq!(
+            tokenizer.next(),
+            Some(Token {
+                word: "lolilol",
+                word_index: 3,
+                char_index: 4
+            })
+        );
+        assert_eq!(
+            tokenizer.next(),
+            Some(Token {
+                word: "hello",
+                word_index: 4,
+                char_index: 14
+            })
+        );
+        assert_eq!(
+            tokenizer.next(),
+            Some(Token {
+                word: "\u{2ec7}",
+                word_index: 5,
+                char_index: 23
+            })
+        );
        assert_eq!(tokenizer.next(), None);
    }
 }