summaryrefslogtreecommitdiff
path: root/src/parsing/tokens.rs
diff options
context:
space:
mode:
Diffstat (limited to 'src/parsing/tokens.rs')
-rw-r--r--src/parsing/tokens.rs465
1 files changed, 465 insertions, 0 deletions
diff --git a/src/parsing/tokens.rs b/src/parsing/tokens.rs
new file mode 100644
index 00000000..74b9c11c
--- /dev/null
+++ b/src/parsing/tokens.rs
@@ -0,0 +1,465 @@
+//! Tokenization of text.
+
+use std::str::CharIndices;
+use smallvec::SmallVec;
+use crate::syntax::*;
+
+
/// Builds an iterator over the tokens of the source code.
///
/// The returned [`Tokens`] iterator borrows `src` and produces tokens
/// lazily, one per call to `next`.
#[inline]
pub fn tokenize(src: &str) -> Tokens {
    Tokens::new(src)
}
+
/// An iterator over the tokens of source code.
#[derive(Debug, Clone)]
pub struct Tokens<'s> {
    /// The complete source string, used to slice out token contents.
    src: &'s str,
    /// Character iterator with double lookahead over `src`.
    pub(in super) chars: PeekableChars<'s>,
    /// The current tokenization state (body vs. function header).
    state: TokensState,
    /// Outer states to return to when a nested scope ends; usually at
    /// most one deep, hence the inline-capacity-1 SmallVec.
    stack: SmallVec<[TokensState; 1]>,
}
+
/// The state the tokenizer is in.
#[derive(Debug, Copy, Clone, Eq, PartialEq)]
enum TokensState {
    /// The base state if there is nothing special we are in.
    Body,
    /// Inside a function header. Here colons and equal signs get parsed
    /// as distinct tokens rather than text.
    Function,
    /// We expect either the end of the function or the beginning of the body.
    /// Entered right after the closing bracket of a function header.
    MaybeBody,
}
+
+impl<'s> Tokens<'s> {
+ /// Create a new token stream from source code.
+ fn new(src: &'s str) -> Tokens<'s> {
+ Tokens {
+ src,
+ chars: PeekableChars::new(src),
+ state: TokensState::Body,
+ stack: SmallVec::new(),
+ }
+ }
+
+ /// Advance the iterator by one step.
+ fn advance(&mut self) {
+ self.chars.next();
+ }
+
+ /// Switch to the given state.
+ fn switch(&mut self, state: TokensState) {
+ self.stack.push(self.state);
+ self.state = state;
+ }
+
+ /// Go back to the top-of-stack state.
+ fn unswitch(&mut self) {
+ self.state = self.stack.pop().unwrap_or(TokensState::Body);
+ }
+
+ /// Advance and return the given token.
+ fn consumed(&mut self, token: Token<'s>) -> Token<'s> {
+ self.advance();
+ token
+ }
+
+ /// Returns a word containing the string bounded by the given indices.
+ fn text(&self, start: usize, end: usize) -> Token<'s> {
+ Token::Text(&self.src[start .. end])
+ }
+}
+
+impl<'s> Iterator for Tokens<'s> {
+ type Item = Token<'s>;
+
+ /// Advance the iterator, return the next token or nothing.
+ fn next(&mut self) -> Option<Token<'s>> {
+ use TokensState as TU;
+
+ // Go to the body state if the function has a body or return to the top-of-stack state.
+ if self.state == TU::MaybeBody {
+ if self.chars.peek()?.1 == '[' {
+ self.state = TU::Body;
+ return Some(self.consumed(Token::LeftBracket));
+ } else {
+ self.unswitch();
+ }
+ }
+
+ // Take the next char and peek at the one behind.
+ let (next_pos, next) = self.chars.next()?;
+ let afterwards = self.chars.peek().map(|p| p.1);
+
+ Some(match next {
+ // Functions
+ '[' => {
+ self.switch(TU::Function);
+ Token::LeftBracket
+ },
+ ']' => {
+ if self.state == TU::Function {
+ self.state = TU::MaybeBody;
+ } else {
+ self.unswitch();
+ }
+ Token::RightBracket
+ },
+
+ // Line comment
+ '/' if afterwards == Some('/') => {
+ let mut end = self.chars.next().unwrap();
+ let start = end.0 + end.1.len_utf8();
+
+ while let Some((index, c)) = self.chars.peek() {
+ if is_newline_char(c) {
+ break;
+ }
+ self.advance();
+ end = (index, c);
+ }
+
+ let end = end.0 + end.1.len_utf8();
+ Token::LineComment(&self.src[start .. end])
+ },
+
+ // Block comment
+ '/' if afterwards == Some('*') => {
+ let mut end = self.chars.next().unwrap();
+ let start = end.0 + end.1.len_utf8();
+
+ let mut nested = 0;
+ while let Some((index, c)) = self.chars.next() {
+ let after = self.chars.peek().map(|p| p.1);
+ match (c, after) {
+ ('*', Some('/')) if nested == 0 => { self.advance(); break },
+ ('/', Some('*')) => { self.advance(); nested += 1 },
+ ('*', Some('/')) => { self.advance(); nested -= 1 },
+ _ => {},
+ }
+ end = (index, c);
+ }
+
+ let end = end.0 + end.1.len_utf8();
+ Token::BlockComment(&self.src[start .. end])
+ },
+
+ // Unexpected end of block comment
+ '*' if afterwards == Some('/') => self.consumed(Token::StarSlash),
+
+ // Whitespace
+ ' ' | '\t' => {
+ while let Some((_, c)) = self.chars.peek() {
+ match c {
+ ' ' | '\t' => self.advance(),
+ _ => break,
+ }
+ }
+ Token::Space
+ }
+
+ // Newlines
+ '\r' if afterwards == Some('\n') => self.consumed(Token::Newline),
+ c if is_newline_char(c) => Token::Newline,
+
+ // Star/Underscore/Backtick in bodies
+ '*' if self.state == TU::Body => Token::Star,
+ '_' if self.state == TU::Body => Token::Underscore,
+ '`' if self.state == TU::Body => Token::Backtick,
+
+ // Context sensitive operators in headers
+ ':' if self.state == TU::Function => Token::Colon,
+ '=' if self.state == TU::Function => Token::Equals,
+ ',' if self.state == TU::Function => Token::Comma,
+
+ // A string value.
+ '"' if self.state == TU::Function => {
+ // Find out when the word ends.
+ let mut escaped = false;
+ let mut end = (next_pos, next);
+
+ while let Some((index, c)) = self.chars.next() {
+ if c == '"' && !escaped {
+ break;
+ }
+
+ escaped = c == '\\';
+ end = (index, c);
+ }
+
+ let end_pos = end.0 + end.1.len_utf8();
+ Token::Quoted(&self.src[next_pos + 1 .. end_pos])
+ }
+
+ // Escaping
+ '\\' => {
+ if let Some((index, c)) = self.chars.peek() {
+ let escapable = match c {
+ '[' | ']' | '\\' | '*' | '_' | '`' | ':' | '=' | '/' => true,
+ _ => false,
+ };
+
+ if escapable {
+ self.advance();
+ return Some(self.text(index, index + c.len_utf8()));
+ }
+ }
+
+ Token::Text("\\")
+ },
+
+ // Normal text
+ _ => {
+ // Find out when the word ends.
+ let mut end = (next_pos, next);
+ while let Some((index, c)) = self.chars.peek() {
+ let second = self.chars.peek_second().map(|p| p.1);
+
+ // Whether the next token is still from the text or not.
+ let continues = match c {
+ '[' | ']' | '\\' => false,
+ '*' | '_' | '`' if self.state == TU::Body => false,
+ ':' | '=' | ',' | '"' if self.state == TU::Function => false,
+
+ '/' => second != Some('/') && second != Some('*'),
+ '*' => second != Some('/'),
+
+ ' ' | '\t' => false,
+ c if is_newline_char(c) => false,
+
+ _ => true,
+ };
+
+ if !continues {
+ break;
+ }
+
+ end = (index, c);
+ self.advance();
+ }
+
+ let end_pos = end.0 + end.1.len_utf8();
+ self.text(next_pos, end_pos)
+ },
+ })
+ }
+}
+
/// All characters that terminate a line: LF, CR, form feed, next line,
/// line separator and paragraph separator.
const NEWLINE_CHARS: [char; 6] =
    ['\n', '\r', '\u{000c}', '\u{0085}', '\u{2028}', '\u{2029}'];

/// Whether this character is a newline (or starts one).
fn is_newline_char(character: char) -> bool {
    NEWLINE_CHARS.contains(&character)
}
+
/// A (index, char) iterator with double lookahead.
#[derive(Debug, Clone)]
pub struct PeekableChars<'s> {
    /// Base offset added to all indices yielded by `chars` (set by `goto`).
    offset: usize,
    /// The underlying string.
    string: &'s str,
    /// Char iterator over `string[offset..]`.
    chars: CharIndices<'s>,
    /// Cached first lookahead; `Some(x)` means `x` was already pulled.
    peek1: Option<Option<(usize, char)>>,
    /// Cached second lookahead; only ever `Some` when `peek1` is.
    peek2: Option<Option<(usize, char)>>,
}

impl<'s> PeekableChars<'s> {
    /// Create a new double-lookahead iterator over a string.
    pub fn new(string: &'s str) -> PeekableChars<'s> {
        PeekableChars {
            offset: 0,
            string,
            chars: string.char_indices(),
            peek1: None,
            peek2: None,
        }
    }

    /// Peek at the next element without consuming it.
    pub fn peek(&mut self) -> Option<(usize, char)> {
        if self.peek1.is_none() {
            let pulled = self.next_inner();
            self.peek1 = Some(pulled);
        }
        self.peek1.unwrap()
    }

    /// Peek at the element after the next one without consuming anything.
    pub fn peek_second(&mut self) -> Option<(usize, char)> {
        if self.peek2.is_none() {
            // Fill the first slot before the second to keep them ordered.
            self.peek();
            let pulled = self.next_inner();
            self.peek2 = Some(pulled);
        }
        self.peek2.unwrap()
    }

    /// The next value of the inner iterator with its index shifted by the
    /// current offset.
    pub fn next_inner(&mut self) -> Option<(usize, char)> {
        let offset = self.offset;
        self.chars.next().map(|(index, c)| (index + offset, c))
    }

    /// The index of the first character of the next token in the source string.
    pub fn current_index(&mut self) -> Option<usize> {
        self.peek().map(|(index, _)| index)
    }

    /// Go to a new position in the underlying string, discarding any
    /// cached lookahead.
    pub fn goto(&mut self, index: usize) {
        self.offset = index;
        self.chars = self.string[index ..].char_indices();
        self.peek1 = None;
        self.peek2 = None;
    }
}

impl Iterator for PeekableChars<'_> {
    type Item = (usize, char);

    fn next(&mut self) -> Option<(usize, char)> {
        if let Some(value) = self.peek1.take() {
            // Shift the second lookahead slot into the first.
            self.peek1 = self.peek2.take();
            value
        } else {
            self.next_inner()
        }
    }
}
+
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+ use Token::{Space as S, Newline as N, LeftBracket as L, RightBracket as R,
+ Colon as C, Equals as E, Quoted as Q, Underscore as TU, Star as TS,
+ Backtick as TB, Text as T, LineComment as LC, BlockComment as BC,
+ StarSlash as SS};
+
+ /// Test if the source code tokenizes to the tokens.
+ fn test(src: &str, tokens: Vec<Token>) {
+ assert_eq!(Tokens::new(src).collect::<Vec<_>>(), tokens);
+ }
+
+ /// Tokenizes the basic building blocks.
+ #[test]
+ fn tokenize_base() {
+ test("", vec![]);
+ test("Hallo", vec![T("Hallo")]);
+ test("[", vec![L]);
+ test("]", vec![R]);
+ test("*", vec![TS]);
+ test("_", vec![TU]);
+ test("`", vec![TB]);
+ test("\n", vec![N]);
+ }
+
+ /// This test looks if LF- and CRLF-style newlines get both identified correctly.
+ #[test]
+ fn tokenize_whitespace_newlines() {
+ test(" \t", vec![S]);
+ test("First line\r\nSecond line\nThird line\n",
+ vec![T("First"), S, T("line"), N, T("Second"), S, T("line"), N,
+ T("Third"), S, T("line"), N]);
+ test("Hello \n ", vec![T("Hello"), S, N, S]);
+ test("Dense\nTimes", vec![T("Dense"), N, T("Times")]);
+ }
+
+ /// Tests if escaping with backslash works as it should.
+ #[test]
+ fn tokenize_escape() {
+ test(r"\[", vec![T("[")]);
+ test(r"\]", vec![T("]")]);
+ test(r"\**", vec![T("*"), TS]);
+ test(r"\*", vec![T("*")]);
+ test(r"\__", vec![T("_"), TU]);
+ test(r"\_", vec![T("_")]);
+ test(r"\hello", vec![T("\\"), T("hello")]);
+ }
+
+ /// Tests if escaped strings work.
+ #[test]
+ fn tokenize_quoted() {
+ test(r#"[align: "hello\"world"]"#, vec![L, T("align"), C, S, Q(r#"hello\"world"#), R]);
+ }
+
+ /// Tokenizes some more realistic examples.
+ #[test]
+ fn tokenize_examples() {
+ test(r"
+ [function][
+ Test [italic][example]!
+ ]
+ ", vec![
+ N, S, L, T("function"), R, L, N, S, T("Test"), S, L, T("italic"), R, L,
+ T("example"), R, T("!"), N, S, R, N, S
+ ]);
+
+ test(r"
+ [page: size=A4]
+ [font: size=12pt]
+
+ Das ist ein Beispielsatz mit *fetter* Schrift.
+ ", vec![
+ N, S, L, T("page"), C, S, T("size"), E, T("A4"), R, N, S,
+ L, T("font"), C, S, T("size"), E, T("12pt"), R, N, N, S,
+ T("Das"), S, T("ist"), S, T("ein"), S, T("Beispielsatz"), S, T("mit"), S,
+ TS, T("fetter"), TS, S, T("Schrift."), N, S
+ ]);
+ }
+
+ /// This test checks whether the colon and equals symbols get parsed correctly depending on the
+ /// context: Either in a function header or in a body.
+ #[test]
+ fn tokenize_symbols_context() {
+ test("[func: key=value][Answer: 7]",
+ vec![L, T("func"), C, S, T("key"), E, T("value"), R, L,
+ T("Answer:"), S, T("7"), R]);
+ test("[[n: k=v]:x][:[=]]:=",
+ vec![L, L, T("n"), C, S, T("k"), E, T("v"), R, C, T("x"), R,
+ L, T(":"), L, E, R, R, T(":=")]);
+ test("[hi: k=[func][body] v=1][hello]",
+ vec![L, T("hi"), C, S, T("k"), E, L, T("func"), R, L, T("body"), R, S,
+ T("v"), E, T("1"), R, L, T("hello"), R]);
+ test("[func: __key__=value]",
+ vec![L, T("func"), C, S, T("__key__"), E, T("value"), R]);
+ test("The /*[*/ answer: 7.",
+ vec![T("The"), S, BC("["), S, T("answer:"), S, T("7.")]);
+ }
+
+ /// Test if block and line comments get tokenized as expected.
+ #[test]
+ fn tokenize_comments() {
+ test("These // Line comments.",
+ vec![T("These"), S, LC(" Line comments.")]);
+ test("This /* is */ a comment.",
+ vec![T("This"), S, BC(" is "), S, T("a"), S, T("comment.")]);
+ test("[Head/*of*/][Body]", vec![L, T("Head"), BC("of"), R, L, T("Body"), R]);
+ test("/* Hey */ */", vec![BC(" Hey "), S, SS]);
+ test("Hey\n// Yoo /*\n*/", vec![T("Hey"), N, LC(" Yoo /*"), N, SS]);
+ test("/* My /* line // */ comment */", vec![BC(" My /* line // */ comment ")])
+ }
+
+ /// This test has a special look at the underscore syntax.
+ #[test]
+ fn tokenize_underscores() {
+ test("he_llo_world_ __ Now this_ is_ special!",
+ vec![T("he"), TU, T("llo"), TU, T("world"), TU, S, TU, TU, S, T("Now"), S,
+ T("this"), TU, S, T("is"), TU, S, T("special!")]);
+ }
+
+ /// This test is for checking if non-ASCII characters get parsed correctly.
+ #[test]
+ fn tokenize_unicode() {
+ test("[document][Hello 🌍!]",
+ vec![L, T("document"), R, L, T("Hello"), S, T("🌍!"), R]);
+ test("[f]⺐.", vec![L, T("f"), R, T("⺐.")]);
+ }
+}