diff options
| author | Laurenz <laurmaedje@gmail.com> | 2020-01-21 17:09:31 +0100 |
|---|---|---|
| committer | Laurenz <laurmaedje@gmail.com> | 2020-01-21 17:09:31 +0100 |
| commit | 78da2bdd5d77d1b8572e5e9da119bfa68127a3fa (patch) | |
| tree | 020c8c39268690d34226eb7e33e75f86304988d6 /src/syntax/tokens.rs | |
| parent | 1c1c994c46f7dc30ee34dbc99b02f2342c4617f3 (diff) | |
Decoupled function parser 🔗 [WIP]
Diffstat (limited to 'src/syntax/tokens.rs')
| -rw-r--r-- | src/syntax/tokens.rs | 180 |
1 file changed, 82 insertions, 98 deletions
diff --git a/src/syntax/tokens.rs b/src/syntax/tokens.rs index 0588bc6c..6c8e736c 100644 --- a/src/syntax/tokens.rs +++ b/src/syntax/tokens.rs @@ -4,7 +4,7 @@ use unicode_xid::UnicodeXID; use super::*; use Token::*; -use State::*; +use TokenizationMode::*; /// A minimal semantic entity of source code. @@ -12,20 +12,20 @@ use State::*; pub enum Token<'s> { /// One or more whitespace characters. The contained `usize` denotes the /// number of newlines that were contained in the whitespace. - Whitespace(usize), + Space(usize), /// A line comment with inner string contents `//<&'s str>\n`. LineComment(&'s str), /// A block comment with inner string contents `/*<&'s str>*/`. The comment /// can contain nested block comments. BlockComment(&'s str), - /// An erroneous `*/` without an opening block comment. - StarSlash, - /// A left bracket: `[`. - LeftBracket, - /// A right bracket: `]`. - RightBracket, + /// A function invocation `[<header>][<body>]`. + Function { + header: &'s str, + body: Option<(Position, &'s str)>, + terminated: bool, + }, /// A left parenthesis in a function header: `(`. LeftParen, @@ -46,7 +46,7 @@ pub enum Token<'s> { /// An identifier in a function header: `center`. ExprIdent(&'s str), /// A quoted string in a function header: `"..."`. - ExprStr(&'s str), + ExprStr { string: &'s str, terminated: bool }, /// A number in a function header: `3.14`. ExprNumber(f64), /// A size in a function header: `12pt`. @@ -63,36 +63,31 @@ pub enum Token<'s> { /// Any other consecutive string. Text(&'s str), -} -/// Decomposes text into a sequence of semantic tokens. -pub fn tokenize(start: Position, src: &str) -> Tokens { - Tokens::new(start, src) + /// Things that are not valid in the context they appeared in. + Invalid(&'s str), } /// An iterator over the tokens of a string of source code. 
pub struct Tokens<'s> { src: &'s str, - state: State, - stack: Vec<(State, Position)>, + mode: TokenizationMode, iter: Peekable<Chars<'s>>, position: Position, index: usize, } #[derive(Debug, Copy, Clone, Eq, PartialEq)] -enum State { +pub enum TokenizationMode { Header, - StartBody, Body, } impl<'s> Tokens<'s> { - pub fn new(start: Position, src: &'s str) -> Tokens<'s> { + pub fn new(start: Position, src: &'s str, mode: TokenizationMode) -> Tokens<'s> { Tokens { src, - state: State::Body, - stack: vec![], + mode, iter: src.chars().peekable(), position: start, index: 0, @@ -110,35 +105,6 @@ impl<'s> Tokens<'s> { pub fn pos(&self) -> Position { self.position } - - /// Move through the string until an unbalanced closing bracket is found - /// without tokenizing the contents. - /// - /// Returns whether a closing bracket was found or the end of the string was - /// reached. - pub fn move_to_closing_bracket(&mut self) -> bool { - let mut escaped = false; - let mut depth = 0; - - self.read_string_until(|n| { - match n { - '[' if !escaped => depth += 1, - ']' if !escaped => { - if depth == 0 { - return true; - } else { - depth -= 1; - } - } - '\\' => escaped = !escaped, - _ => escaped = false, - } - - false - }, false, 0, 0); - - self.peek() == Some(']') - } } impl<'s> Iterator for Tokens<'s> { @@ -153,55 +119,31 @@ impl<'s> Iterator for Tokens<'s> { // Comments. '/' if self.peek() == Some('/') => self.parse_line_comment(), '/' if self.peek() == Some('*') => self.parse_block_comment(), - '*' if self.peek() == Some('/') => { self.eat(); StarSlash } + '*' if self.peek() == Some('/') => { self.eat(); Invalid("*/") } // Whitespace. c if c.is_whitespace() => self.parse_whitespace(start), // Functions. 
- '[' => { - match self.state { - Header | Body => { - self.stack.push((self.state, start)); - self.position = Position::new(0, '['.len_utf8()); - self.state = Header; - } - StartBody => self.state = Body, - } - - LeftBracket - } - ']' => { - if self.state == Header && self.peek() == Some('[') { - self.state = StartBody; - } else { - if let Some((state, pos)) = self.stack.pop() { - self.state = state; - self.position = pos + self.position; - } else { - self.state = Body; - } - } - - RightBracket - } + '[' => self.parse_function(start), + ']' => Invalid("]"), // Syntactic elements in function headers. - '(' if self.state == Header => LeftParen, - ')' if self.state == Header => RightParen, - '{' if self.state == Header => LeftBrace, - '}' if self.state == Header => RightBrace, - ':' if self.state == Header => Colon, - ',' if self.state == Header => Comma, - '=' if self.state == Header => Equals, + '(' if self.mode == Header => LeftParen, + ')' if self.mode == Header => RightParen, + '{' if self.mode == Header => LeftBrace, + '}' if self.mode == Header => RightBrace, + ':' if self.mode == Header => Colon, + ',' if self.mode == Header => Comma, + '=' if self.mode == Header => Equals, // String values. - '"' if self.state == Header => self.parse_string(), + '"' if self.mode == Header => self.parse_string(), // Style toggles. - '*' if self.state == Body => Star, - '_' if self.state == Body => Underscore, - '`' if self.state == Body => Backtick, + '*' if self.mode == Body => Star, + '_' if self.mode == Body => Underscore, + '`' if self.mode == Body => Backtick, // An escaped thing. 
'\\' => self.parse_escaped(), @@ -215,9 +157,9 @@ impl<'s> Iterator for Tokens<'s> { ',' | '"' | '/' => true, _ => false, } - }, false, -(c.len_utf8() as isize), 0); + }, false, -(c.len_utf8() as isize), 0).0; - if self.state == Header { + if self.mode == Header { self.parse_expr(text) } else { Text(text) @@ -234,7 +176,7 @@ impl<'s> Iterator for Tokens<'s> { impl<'s> Tokens<'s> { fn parse_line_comment(&mut self) -> Token<'s> { - LineComment(self.read_string_until(is_newline_char, false, 1, 0)) + LineComment(self.read_string_until(is_newline_char, false, 1, 0).0) } fn parse_block_comment(&mut self) -> Token<'s> { @@ -262,19 +204,60 @@ impl<'s> Tokens<'s> { } false - }, true, 0, -2)) + }, true, 0, -2).0) } fn parse_whitespace(&mut self, start: Position) -> Token<'s> { self.read_string_until(|n| !n.is_whitespace(), false, 0, 0); let end = self.pos(); - Whitespace(end.line - start.line) + Space(end.line - start.line) + } + + fn parse_function(&mut self, start: Position) -> Token<'s> { + let (header, terminated) = self.read_function_part(); + self.eat(); + + if self.peek() != Some('[') { + return Function { header, body: None, terminated }; + } + + self.eat(); + + let offset = self.pos() - start; + let (body, terminated) = self.read_function_part(); + self.eat(); + + Function { header, body: Some((offset, body)), terminated } + } + + fn read_function_part(&mut self) -> (&'s str, bool) { + let mut escaped = false; + let mut in_string = false; + let mut depth = 0; + + self.read_string_until(|n| { + match n { + '"' if !escaped => in_string = !in_string, + '[' if !escaped && !in_string => depth += 1, + ']' if !escaped && !in_string => { + if depth == 0 { + return true; + } else { + depth -= 1; + } + } + '\\' => escaped = !escaped, + _ => escaped = false, + } + + false + }, false, 0, 0) } fn parse_string(&mut self) -> Token<'s> { let mut escaped = false; - ExprStr(self.read_string_until(|n| { + let (string, terminated) = self.read_string_until(|n| { match n { '"' if 
!escaped => return true, '\\' => escaped = !escaped, @@ -282,7 +265,8 @@ impl<'s> Tokens<'s> { } false - }, true, 0, -1)) + }, true, 0, -1); + ExprStr { string, terminated } } fn parse_escaped(&mut self) -> Token<'s> { @@ -294,7 +278,7 @@ impl<'s> Tokens<'s> { } let c = self.peek().unwrap_or('n'); - if self.state == Body && is_escapable(c) { + if self.mode == Body && is_escapable(c) { let index = self.index(); self.eat(); Text(&self.src[index .. index + c.len_utf8()]) @@ -315,7 +299,7 @@ impl<'s> Tokens<'s> { } else if is_identifier(text) { ExprIdent(text) } else { - Text(text) + Invalid(text) } } @@ -325,7 +309,7 @@ impl<'s> Tokens<'s> { eat_match: bool, offset_start: isize, offset_end: isize, - ) -> &'s str where F: FnMut(char) -> bool { + ) -> (&'s str, bool) where F: FnMut(char) -> bool { let start = ((self.index() as isize) + offset_start) as usize; let mut matched = false; @@ -346,7 +330,7 @@ impl<'s> Tokens<'s> { end = ((end as isize) + offset_end) as usize; } - &self.src[start .. end] + (&self.src[start .. end], matched) } fn eat(&mut self) -> Option<char> { |
