diff options
| author | Laurenz <laurmaedje@gmail.com> | 2020-10-02 15:43:29 +0200 |
|---|---|---|
| committer | Laurenz <laurmaedje@gmail.com> | 2020-10-02 15:43:29 +0200 |
| commit | 3533268b1f7a31581e7b8f44dff6d4f553ef348f (patch) | |
| tree | 3fee21d2df7ce173131f75f46a1ef040f272ed29 /src/parse/tokens.rs | |
| parent | f8770d2b2a8ac389704897f92f2753398352835b (diff) | |
Refactor parser 🏞
Diffstat (limited to 'src/parse/tokens.rs')
| -rw-r--r-- | src/parse/tokens.rs | 239 |
1 files changed, 122 insertions, 117 deletions
diff --git a/src/parse/tokens.rs b/src/parse/tokens.rs index 9f30f587..72d7b2d9 100644 --- a/src/parse/tokens.rs +++ b/src/parse/tokens.rs @@ -1,17 +1,19 @@ //! Tokenization. +use std::fmt::{self, Debug, Formatter}; + use super::{is_newline, Scanner}; use crate::length::Length; -use crate::syntax::{is_ident, Pos, Span, SpanWith, Spanned, Token}; +use crate::syntax::token::*; +use crate::syntax::{is_ident, Pos, Span, SpanWith, Spanned}; use TokenMode::*; /// An iterator over the tokens of a string of source code. -#[derive(Debug)] +#[derive(Clone)] pub struct Tokens<'s> { s: Scanner<'s>, mode: TokenMode, - stack: Vec<TokenMode>, } /// Whether to tokenize in header mode which yields expression, comma and @@ -26,23 +28,17 @@ pub enum TokenMode { impl<'s> Tokens<'s> { /// Create a new token iterator with the given mode. pub fn new(src: &'s str, mode: TokenMode) -> Self { - Self { - s: Scanner::new(src), - mode, - stack: vec![], - } + Self { s: Scanner::new(src), mode } } - /// Change the token mode and push the old one on a stack. - pub fn push_mode(&mut self, mode: TokenMode) { - self.stack.push(self.mode); - self.mode = mode; + /// Get the current token mode. + pub fn mode(&self) -> TokenMode { + self.mode } - /// Pop the old token mode from the stack. This panics if there is no mode - /// on the stack. - pub fn pop_mode(&mut self) { - self.mode = self.stack.pop().expect("no pushed mode"); + /// Change the token mode. + pub fn set_mode(&mut self, mode: TokenMode) { + self.mode = mode; } /// The position in the string at which the last token ends and next token @@ -50,6 +46,16 @@ impl<'s> Tokens<'s> { pub fn pos(&self) -> Pos { self.s.index().into() } + + /// Jump to a position in the source string. + pub fn jump(&mut self, pos: Pos) { + self.s.jump(pos.to_usize()); + } + + /// The underlying scanner. + pub fn scanner(&self) -> &Scanner<'s> { + &self.s + } } impl<'s> Iterator for Tokens<'s> { @@ -59,8 +65,12 @@ impl<'s> Iterator for Tokens<'s> { fn next(&mut self) -> Option<Self::Item> { let start = self.s.index(); let token = match self.s.eat()? { - // Whitespace. - c if c.is_whitespace() => self.read_whitespace(c), + // Whitespace with fast path for just a single space. + ' ' if !self.s.check(|c| c.is_whitespace()) => Token::Space(0), + c if c.is_whitespace() => { + self.s.jump(start); + self.read_whitespace() + } // Comments. '/' if self.s.eat_if('/') => self.read_line_comment(), @@ -76,8 +86,8 @@ impl<'s> Iterator for Tokens<'s> { // Syntactic elements in body text. '*' if self.mode == Body => Token::Star, '_' if self.mode == Body => Token::Underscore, - '`' if self.mode == Body => self.read_raw(), '#' if self.mode == Body => Token::Hashtag, + '`' if self.mode == Body => self.read_raw(), '~' if self.mode == Body => Token::Text("\u{00A0}"), '\\' if self.mode == Body => self.read_escaped(), @@ -88,12 +98,12 @@ impl<'s> Iterator for Tokens<'s> { ',' if self.mode == Header => Token::Comma, '=' if self.mode == Header => Token::Equals, '>' if self.mode == Header && self.s.eat_if('>') => Token::Chain, - - // Expressions in headers. '+' if self.mode == Header => Token::Plus, '-' if self.mode == Header => Token::Hyphen, '*' if self.mode == Header => Token::Star, '/' if self.mode == Header => Token::Slash, + + // Expressions in headers. '#' if self.mode == Header => self.read_hex(), '"' if self.mode == Header => self.read_string(), @@ -107,18 +117,7 @@ impl<'s> Iterator for Tokens<'s> { } impl<'s> Tokens<'s> { - fn read_whitespace(&mut self, first: char) -> Token<'s> { - // Shortcut for common case of exactly one space. - if first == ' ' && !self.s.check(|c| c.is_whitespace()) { - return Token::Space(0); - } - - // Uneat the first char if it's a newline, so that it's counted in the - // loop. - if is_newline(first) { - self.s.uneat(); - } - + fn read_whitespace(&mut self) -> Token<'s> { // Count the number of newlines. let mut newlines = 0; while let Some(c) = self.s.eat_merging_crlf() { @@ -169,27 +168,6 @@ impl<'s> Tokens<'s> { Token::BlockComment(self.s.get(start .. end)) } - fn read_hex(&mut self) -> Token<'s> { - // This parses more than the permissable 0-9, a-f, A-F character ranges - // to provide nicer error messages later. - Token::Hex(self.s.eat_while(|c| c.is_ascii_alphanumeric())) - } - - fn read_string(&mut self) -> Token<'s> { - let mut escaped = false; - Token::Str { - string: self.s.eat_until(|c| { - if c == '"' && !escaped { - true - } else { - escaped = c == '\\' && !escaped; - false - } - }), - terminated: self.s.eat_if('"'), - } - } - fn read_raw(&mut self) -> Token<'s> { let mut backticks = 1; while self.s.eat_if('`') { @@ -210,11 +188,11 @@ impl<'s> Tokens<'s> { let terminated = found == backticks; let end = self.s.index() - if terminated { found } else { 0 }; - Token::Raw { - raw: self.s.get(start .. end), + Token::Raw(TokenRaw { + text: self.s.get(start .. end), backticks, terminated, - } + }) } fn read_escaped(&mut self) -> Token<'s> { @@ -228,10 +206,10 @@ impl<'s> Tokens<'s> { 'u' if self.s.peek_nth(1) == Some('{') => { self.s.eat_assert('u'); self.s.eat_assert('{'); - Token::UnicodeEscape { + Token::UnicodeEscape(TokenUnicodeEscape { sequence: self.s.eat_while(|c| c.is_ascii_hexdigit()), terminated: self.s.eat_if('}'), - } + }) } c if c.is_whitespace() => Token::Backslash, _ => Token::Text("\\"), @@ -241,6 +219,27 @@ impl<'s> Tokens<'s> { } } + fn read_hex(&mut self) -> Token<'s> { + // This parses more than the permissable 0-9, a-f, A-F character ranges + // to provide nicer error messages later. + Token::Hex(self.s.eat_while(|c| c.is_ascii_alphanumeric())) + } + + fn read_string(&mut self) -> Token<'s> { + let mut escaped = false; + Token::Str(TokenStr { + string: self.s.eat_until(|c| { + if c == '"' && !escaped { + true + } else { + escaped = c == '\\' && !escaped; + false + } + }), + terminated: self.s.eat_if('"'), + }) + } + fn read_text_or_expr(&mut self, start: usize) -> Token<'s> { let body = self.mode == Body; let header = self.mode == Header; @@ -268,6 +267,12 @@ impl<'s> Tokens<'s> { } } +impl Debug for Tokens<'_> { + fn fmt(&self, f: &mut Formatter) -> fmt::Result { + write!(f, "Tokens({}|{})", self.s.eaten(), self.s.rest()) + } +} + fn parse_expr(text: &str) -> Token<'_> { if let Ok(b) = text.parse::<bool>() { Token::Bool(b) @@ -303,13 +308,13 @@ mod tests { }; fn Str(string: &str, terminated: bool) -> Token { - Token::Str { string, terminated } + Token::Str(TokenStr { string, terminated }) } - fn Raw(raw: &str, backticks: usize, terminated: bool) -> Token { - Token::Raw { raw, backticks, terminated } + fn Raw(text: &str, backticks: usize, terminated: bool) -> Token { + Token::Raw(TokenRaw { text, backticks, terminated }) } fn UE(sequence: &str, terminated: bool) -> Token { - Token::UnicodeEscape { sequence, terminated } + Token::UnicodeEscape(TokenUnicodeEscape { sequence, terminated }) } macro_rules! t { ($($tts:tt)*) => {test!(@spans=false, $($tts)*)} } @@ -389,36 +394,65 @@ mod tests { } #[test] + fn tokenize_escaped_symbols() { + t!(Body, r"\\" => T(r"\")); + t!(Body, r"\[" => T("[")); + t!(Body, r"\]" => T("]")); + t!(Body, r"\*" => T("*")); + t!(Body, r"\_" => T("_")); + t!(Body, r"\`" => T("`")); + t!(Body, r"\/" => T("/")); + t!(Body, r"\u{2603}" => UE("2603", true)); + t!(Body, r"\u{26A4" => UE("26A4", false)); + t!(Body, r#"\""# => T("\"")); + } + + #[test] + fn tokenize_unescapable_symbols() { + t!(Body, r"\a" => T("\\"), T("a")); + t!(Body, r"\:" => T(r"\"), T(":")); + t!(Body, r"\=" => T(r"\"), T("=")); + t!(Body, r"\u{2GA4" => UE("2", false), T("GA4")); + t!(Body, r"\u{ " => UE("", false), Space(0)); + t!(Body, r"\u" => T("\\"), T("u")); + t!(Header, r"\\\\" => Invalid(r"\\\\")); + t!(Header, r"\a" => Invalid(r"\a")); + t!(Header, r"\:" => Invalid(r"\"), Colon); + t!(Header, r"\=" => Invalid(r"\"), Equals); + t!(Header, r"\," => Invalid(r"\"), Comma); + } + + #[test] fn tokenize_header_tokens() { - t!(Header, "__main__" => Id("__main__")); - t!(Header, "_func_box" => Id("_func_box")); - t!(Header, ">main" => Invalid(">main")); - t!(Header, "🌓, 🌍," => Invalid("🌓"), Comma, S(0), Invalid("🌍"), Comma); - t!(Header, "{abc}" => LB, Id("abc"), RB); - t!(Header, "(1,2)" => LP, Num(1.0), Comma, Num(2.0), RP); - t!(Header, "12_pt, 12pt" => Invalid("12_pt"), Comma, S(0), Len(Length::pt(12.0))); - t!(Header, "f: arg >> g" => Id("f"), Colon, S(0), Id("arg"), S(0), Chain, S(0), Id("g")); - t!(Header, "=3.14" => Equals, Num(3.14)); - t!(Header, "arg, _b, _1" => Id("arg"), Comma, S(0), Id("_b"), Comma, S(0), Id("_1")); - t!(Header, "a:b" => Id("a"), Colon, Id("b")); - t!(Header, "(){}:=," => LP, RP, LB, RB, Colon, Equals, Comma); - t!(Body, "c=d, " => T("c=d,"), S(0)); - t!(Body, "a: b" => T("a:"), S(0), T("b")); - t!(Header, "a: true, x=1" => Id("a"), Colon, S(0), Bool(true), Comma, S(0), - Id("x"), Equals, Num(1.0)); + t!(Header, "__main__" => Id("__main__")); + t!(Header, "_func_box" => Id("_func_box")); + t!(Header, ">main" => Invalid(">main")); + t!(Header, "🌓, 🌍," => Invalid("🌓"), Comma, S(0), Invalid("🌍"), Comma); + t!(Header, "{abc}" => LB, Id("abc"), RB); + t!(Header, "(1,2)" => LP, Num(1.0), Comma, Num(2.0), RP); + t!(Header, "12_pt, 12pt" => Invalid("12_pt"), Comma, S(0), Len(Length::pt(12.0))); + t!(Header, "f: arg >> g" => Id("f"), Colon, S(0), Id("arg"), S(0), Chain, S(0), Id("g")); + t!(Header, "=3.14" => Equals, Num(3.14)); + t!(Header, "arg, _b, _1" => Id("arg"), Comma, S(0), Id("_b"), Comma, S(0), Id("_1")); + t!(Header, "a:b" => Id("a"), Colon, Id("b")); + t!(Header, "(){}:=," => LP, RP, LB, RB, Colon, Equals, Comma); + t!(Body, "c=d, " => T("c=d,"), S(0)); + t!(Body, "a: b" => T("a:"), S(0), T("b")); + t!(Header, "a: true, x=1" => Id("a"), Colon, S(0), Bool(true), Comma, S(0), + Id("x"), Equals, Num(1.0)); } #[test] fn tokenize_numeric_values() { - t!(Header, "12.3e5" => Num(12.3e5)); - t!(Header, "120%" => Num(1.2)); - t!(Header, "12e4%" => Num(1200.0)); - t!(Header, "1e5in" => Len(Length::inches(100000.0))); - t!(Header, "2.3cm" => Len(Length::cm(2.3))); - t!(Header, "02.4mm" => Len(Length::mm(2.4))); - t!(Header, "2.4.cm" => Invalid("2.4.cm")); - t!(Header, "#6ae6dd" => Hex("6ae6dd")); - t!(Header, "#8A083c" => Hex("8A083c")); + t!(Header, "12.3e5" => Num(12.3e5)); + t!(Header, "120%" => Num(1.2)); + t!(Header, "12e4%" => Num(1200.0)); + t!(Header, "1e5in" => Len(Length::inches(100000.0))); + t!(Header, "2.3cm" => Len(Length::cm(2.3))); + t!(Header, "02.4mm" => Len(Length::mm(2.4))); + t!(Header, "2.4.cm" => Invalid("2.4.cm")); + t!(Header, "#6ae6dd" => Hex("6ae6dd")); + t!(Header, "#8A083c" => Hex("8A083c")); } #[test] @@ -447,35 +481,6 @@ mod tests { } #[test] - fn tokenize_escaped_symbols() { - t!(Body, r"\\" => T(r"\")); - t!(Body, r"\[" => T("[")); - t!(Body, r"\]" => T("]")); - t!(Body, r"\*" => T("*")); - t!(Body, r"\_" => T("_")); - t!(Body, r"\`" => T("`")); - t!(Body, r"\/" => T("/")); - t!(Body, r"\u{2603}" => UE("2603", true)); - t!(Body, r"\u{26A4" => UE("26A4", false)); - t!(Body, r#"\""# => T("\"")); - } - - #[test] - fn tokenize_unescapable_symbols() { - t!(Body, r"\a" => T("\\"), T("a")); - t!(Body, r"\:" => T(r"\"), T(":")); - t!(Body, r"\=" => T(r"\"), T("=")); - t!(Body, r"\u{2GA4" => UE("2", false), T("GA4")); - t!(Body, r"\u{ " => UE("", false), Space(0)); - t!(Body, r"\u" => T("\\"), T("u")); - t!(Header, r"\\\\" => Invalid(r"\\\\")); - t!(Header, r"\a" => Invalid(r"\a")); - t!(Header, r"\:" => Invalid(r"\"), Colon); - t!(Header, r"\=" => Invalid(r"\"), Equals); - t!(Header, r"\," => Invalid(r"\"), Comma); - } - - #[test] fn tokenize_with_spans() { ts!(Body, "hello" => s(0, 5, T("hello"))); ts!(Body, "ab\r\nc" => s(0, 2, T("ab")), s(2, 4, S(1)), s(4, 5, T("c"))); |
