summaryrefslogtreecommitdiff
path: root/src/parse/tokens.rs
diff options
context:
space:
mode:
authorLaurenz <laurmaedje@gmail.com>2020-10-02 15:43:29 +0200
committerLaurenz <laurmaedje@gmail.com>2020-10-02 15:43:29 +0200
commit3533268b1f7a31581e7b8f44dff6d4f553ef348f (patch)
tree3fee21d2df7ce173131f75f46a1ef040f272ed29 /src/parse/tokens.rs
parentf8770d2b2a8ac389704897f92f2753398352835b (diff)
Refactor parser 🏞
Diffstat (limited to 'src/parse/tokens.rs')
-rw-r--r--src/parse/tokens.rs239
1 files changed, 122 insertions, 117 deletions
diff --git a/src/parse/tokens.rs b/src/parse/tokens.rs
index 9f30f587..72d7b2d9 100644
--- a/src/parse/tokens.rs
+++ b/src/parse/tokens.rs
@@ -1,17 +1,19 @@
//! Tokenization.
+use std::fmt::{self, Debug, Formatter};
+
use super::{is_newline, Scanner};
use crate::length::Length;
-use crate::syntax::{is_ident, Pos, Span, SpanWith, Spanned, Token};
+use crate::syntax::token::*;
+use crate::syntax::{is_ident, Pos, Span, SpanWith, Spanned};
use TokenMode::*;
/// An iterator over the tokens of a string of source code.
-#[derive(Debug)]
+#[derive(Clone)]
pub struct Tokens<'s> {
s: Scanner<'s>,
mode: TokenMode,
- stack: Vec<TokenMode>,
}
/// Whether to tokenize in header mode which yields expression, comma and
@@ -26,23 +28,17 @@ pub enum TokenMode {
impl<'s> Tokens<'s> {
/// Create a new token iterator with the given mode.
pub fn new(src: &'s str, mode: TokenMode) -> Self {
- Self {
- s: Scanner::new(src),
- mode,
- stack: vec![],
- }
+ Self { s: Scanner::new(src), mode }
}
- /// Change the token mode and push the old one on a stack.
- pub fn push_mode(&mut self, mode: TokenMode) {
- self.stack.push(self.mode);
- self.mode = mode;
+ /// Get the current token mode.
+ pub fn mode(&self) -> TokenMode {
+ self.mode
}
- /// Pop the old token mode from the stack. This panics if there is no mode
- /// on the stack.
- pub fn pop_mode(&mut self) {
- self.mode = self.stack.pop().expect("no pushed mode");
+ /// Change the token mode.
+ pub fn set_mode(&mut self, mode: TokenMode) {
+ self.mode = mode;
}
/// The position in the string at which the last token ends and next token
@@ -50,6 +46,16 @@ impl<'s> Tokens<'s> {
pub fn pos(&self) -> Pos {
self.s.index().into()
}
+
+ /// Jump to a position in the source string.
+ pub fn jump(&mut self, pos: Pos) {
+ self.s.jump(pos.to_usize());
+ }
+
+ /// The underlying scanner.
+ pub fn scanner(&self) -> &Scanner<'s> {
+ &self.s
+ }
}
impl<'s> Iterator for Tokens<'s> {
@@ -59,8 +65,12 @@ impl<'s> Iterator for Tokens<'s> {
fn next(&mut self) -> Option<Self::Item> {
let start = self.s.index();
let token = match self.s.eat()? {
- // Whitespace.
- c if c.is_whitespace() => self.read_whitespace(c),
+ // Whitespace with fast path for just a single space.
+ ' ' if !self.s.check(|c| c.is_whitespace()) => Token::Space(0),
+ c if c.is_whitespace() => {
+ self.s.jump(start);
+ self.read_whitespace()
+ }
// Comments.
'/' if self.s.eat_if('/') => self.read_line_comment(),
@@ -76,8 +86,8 @@ impl<'s> Iterator for Tokens<'s> {
// Syntactic elements in body text.
'*' if self.mode == Body => Token::Star,
'_' if self.mode == Body => Token::Underscore,
- '`' if self.mode == Body => self.read_raw(),
'#' if self.mode == Body => Token::Hashtag,
+ '`' if self.mode == Body => self.read_raw(),
'~' if self.mode == Body => Token::Text("\u{00A0}"),
'\\' if self.mode == Body => self.read_escaped(),
@@ -88,12 +98,12 @@ impl<'s> Iterator for Tokens<'s> {
',' if self.mode == Header => Token::Comma,
'=' if self.mode == Header => Token::Equals,
'>' if self.mode == Header && self.s.eat_if('>') => Token::Chain,
-
- // Expressions in headers.
'+' if self.mode == Header => Token::Plus,
'-' if self.mode == Header => Token::Hyphen,
'*' if self.mode == Header => Token::Star,
'/' if self.mode == Header => Token::Slash,
+
+ // Expressions in headers.
'#' if self.mode == Header => self.read_hex(),
'"' if self.mode == Header => self.read_string(),
@@ -107,18 +117,7 @@ impl<'s> Iterator for Tokens<'s> {
}
impl<'s> Tokens<'s> {
- fn read_whitespace(&mut self, first: char) -> Token<'s> {
- // Shortcut for common case of exactly one space.
- if first == ' ' && !self.s.check(|c| c.is_whitespace()) {
- return Token::Space(0);
- }
-
- // Uneat the first char if it's a newline, so that it's counted in the
- // loop.
- if is_newline(first) {
- self.s.uneat();
- }
-
+ fn read_whitespace(&mut self) -> Token<'s> {
// Count the number of newlines.
let mut newlines = 0;
while let Some(c) = self.s.eat_merging_crlf() {
@@ -169,27 +168,6 @@ impl<'s> Tokens<'s> {
Token::BlockComment(self.s.get(start .. end))
}
- fn read_hex(&mut self) -> Token<'s> {
- // This parses more than the permissable 0-9, a-f, A-F character ranges
- // to provide nicer error messages later.
- Token::Hex(self.s.eat_while(|c| c.is_ascii_alphanumeric()))
- }
-
- fn read_string(&mut self) -> Token<'s> {
- let mut escaped = false;
- Token::Str {
- string: self.s.eat_until(|c| {
- if c == '"' && !escaped {
- true
- } else {
- escaped = c == '\\' && !escaped;
- false
- }
- }),
- terminated: self.s.eat_if('"'),
- }
- }
-
fn read_raw(&mut self) -> Token<'s> {
let mut backticks = 1;
while self.s.eat_if('`') {
@@ -210,11 +188,11 @@ impl<'s> Tokens<'s> {
let terminated = found == backticks;
let end = self.s.index() - if terminated { found } else { 0 };
- Token::Raw {
- raw: self.s.get(start .. end),
+ Token::Raw(TokenRaw {
+ text: self.s.get(start .. end),
backticks,
terminated,
- }
+ })
}
fn read_escaped(&mut self) -> Token<'s> {
@@ -228,10 +206,10 @@ impl<'s> Tokens<'s> {
'u' if self.s.peek_nth(1) == Some('{') => {
self.s.eat_assert('u');
self.s.eat_assert('{');
- Token::UnicodeEscape {
+ Token::UnicodeEscape(TokenUnicodeEscape {
sequence: self.s.eat_while(|c| c.is_ascii_hexdigit()),
terminated: self.s.eat_if('}'),
- }
+ })
}
c if c.is_whitespace() => Token::Backslash,
_ => Token::Text("\\"),
@@ -241,6 +219,27 @@ impl<'s> Tokens<'s> {
}
}
+ fn read_hex(&mut self) -> Token<'s> {
+ // This parses more than the permissible 0-9, a-f, A-F character ranges
+ // to provide nicer error messages later.
+ Token::Hex(self.s.eat_while(|c| c.is_ascii_alphanumeric()))
+ }
+
+ fn read_string(&mut self) -> Token<'s> {
+ let mut escaped = false;
+ Token::Str(TokenStr {
+ string: self.s.eat_until(|c| {
+ if c == '"' && !escaped {
+ true
+ } else {
+ escaped = c == '\\' && !escaped;
+ false
+ }
+ }),
+ terminated: self.s.eat_if('"'),
+ })
+ }
+
fn read_text_or_expr(&mut self, start: usize) -> Token<'s> {
let body = self.mode == Body;
let header = self.mode == Header;
@@ -268,6 +267,12 @@ impl<'s> Tokens<'s> {
}
}
+impl Debug for Tokens<'_> {
+ fn fmt(&self, f: &mut Formatter) -> fmt::Result {
+ write!(f, "Tokens({}|{})", self.s.eaten(), self.s.rest())
+ }
+}
+
fn parse_expr(text: &str) -> Token<'_> {
if let Ok(b) = text.parse::<bool>() {
Token::Bool(b)
@@ -303,13 +308,13 @@ mod tests {
};
fn Str(string: &str, terminated: bool) -> Token {
- Token::Str { string, terminated }
+ Token::Str(TokenStr { string, terminated })
}
- fn Raw(raw: &str, backticks: usize, terminated: bool) -> Token {
- Token::Raw { raw, backticks, terminated }
+ fn Raw(text: &str, backticks: usize, terminated: bool) -> Token {
+ Token::Raw(TokenRaw { text, backticks, terminated })
}
fn UE(sequence: &str, terminated: bool) -> Token {
- Token::UnicodeEscape { sequence, terminated }
+ Token::UnicodeEscape(TokenUnicodeEscape { sequence, terminated })
}
macro_rules! t { ($($tts:tt)*) => {test!(@spans=false, $($tts)*)} }
@@ -389,36 +394,65 @@ mod tests {
}
#[test]
+ fn tokenize_escaped_symbols() {
+ t!(Body, r"\\" => T(r"\"));
+ t!(Body, r"\[" => T("["));
+ t!(Body, r"\]" => T("]"));
+ t!(Body, r"\*" => T("*"));
+ t!(Body, r"\_" => T("_"));
+ t!(Body, r"\`" => T("`"));
+ t!(Body, r"\/" => T("/"));
+ t!(Body, r"\u{2603}" => UE("2603", true));
+ t!(Body, r"\u{26A4" => UE("26A4", false));
+ t!(Body, r#"\""# => T("\""));
+ }
+
+ #[test]
+ fn tokenize_unescapable_symbols() {
+ t!(Body, r"\a" => T("\\"), T("a"));
+ t!(Body, r"\:" => T(r"\"), T(":"));
+ t!(Body, r"\=" => T(r"\"), T("="));
+ t!(Body, r"\u{2GA4" => UE("2", false), T("GA4"));
+ t!(Body, r"\u{ " => UE("", false), Space(0));
+ t!(Body, r"\u" => T("\\"), T("u"));
+ t!(Header, r"\\\\" => Invalid(r"\\\\"));
+ t!(Header, r"\a" => Invalid(r"\a"));
+ t!(Header, r"\:" => Invalid(r"\"), Colon);
+ t!(Header, r"\=" => Invalid(r"\"), Equals);
+ t!(Header, r"\," => Invalid(r"\"), Comma);
+ }
+
+ #[test]
fn tokenize_header_tokens() {
- t!(Header, "__main__" => Id("__main__"));
- t!(Header, "_func_box" => Id("_func_box"));
- t!(Header, ">main" => Invalid(">main"));
- t!(Header, "🌓, 🌍," => Invalid("🌓"), Comma, S(0), Invalid("🌍"), Comma);
- t!(Header, "{abc}" => LB, Id("abc"), RB);
- t!(Header, "(1,2)" => LP, Num(1.0), Comma, Num(2.0), RP);
- t!(Header, "12_pt, 12pt" => Invalid("12_pt"), Comma, S(0), Len(Length::pt(12.0)));
- t!(Header, "f: arg >> g" => Id("f"), Colon, S(0), Id("arg"), S(0), Chain, S(0), Id("g"));
- t!(Header, "=3.14" => Equals, Num(3.14));
- t!(Header, "arg, _b, _1" => Id("arg"), Comma, S(0), Id("_b"), Comma, S(0), Id("_1"));
- t!(Header, "a:b" => Id("a"), Colon, Id("b"));
- t!(Header, "(){}:=," => LP, RP, LB, RB, Colon, Equals, Comma);
- t!(Body, "c=d, " => T("c=d,"), S(0));
- t!(Body, "a: b" => T("a:"), S(0), T("b"));
- t!(Header, "a: true, x=1" => Id("a"), Colon, S(0), Bool(true), Comma, S(0),
- Id("x"), Equals, Num(1.0));
+ t!(Header, "__main__" => Id("__main__"));
+ t!(Header, "_func_box" => Id("_func_box"));
+ t!(Header, ">main" => Invalid(">main"));
+ t!(Header, "🌓, 🌍," => Invalid("🌓"), Comma, S(0), Invalid("🌍"), Comma);
+ t!(Header, "{abc}" => LB, Id("abc"), RB);
+ t!(Header, "(1,2)" => LP, Num(1.0), Comma, Num(2.0), RP);
+ t!(Header, "12_pt, 12pt" => Invalid("12_pt"), Comma, S(0), Len(Length::pt(12.0)));
+ t!(Header, "f: arg >> g" => Id("f"), Colon, S(0), Id("arg"), S(0), Chain, S(0), Id("g"));
+ t!(Header, "=3.14" => Equals, Num(3.14));
+ t!(Header, "arg, _b, _1" => Id("arg"), Comma, S(0), Id("_b"), Comma, S(0), Id("_1"));
+ t!(Header, "a:b" => Id("a"), Colon, Id("b"));
+ t!(Header, "(){}:=," => LP, RP, LB, RB, Colon, Equals, Comma);
+ t!(Body, "c=d, " => T("c=d,"), S(0));
+ t!(Body, "a: b" => T("a:"), S(0), T("b"));
+ t!(Header, "a: true, x=1" => Id("a"), Colon, S(0), Bool(true), Comma, S(0),
+ Id("x"), Equals, Num(1.0));
}
#[test]
fn tokenize_numeric_values() {
- t!(Header, "12.3e5" => Num(12.3e5));
- t!(Header, "120%" => Num(1.2));
- t!(Header, "12e4%" => Num(1200.0));
- t!(Header, "1e5in" => Len(Length::inches(100000.0)));
- t!(Header, "2.3cm" => Len(Length::cm(2.3)));
- t!(Header, "02.4mm" => Len(Length::mm(2.4)));
- t!(Header, "2.4.cm" => Invalid("2.4.cm"));
- t!(Header, "#6ae6dd" => Hex("6ae6dd"));
- t!(Header, "#8A083c" => Hex("8A083c"));
+ t!(Header, "12.3e5" => Num(12.3e5));
+ t!(Header, "120%" => Num(1.2));
+ t!(Header, "12e4%" => Num(1200.0));
+ t!(Header, "1e5in" => Len(Length::inches(100000.0)));
+ t!(Header, "2.3cm" => Len(Length::cm(2.3)));
+ t!(Header, "02.4mm" => Len(Length::mm(2.4)));
+ t!(Header, "2.4.cm" => Invalid("2.4.cm"));
+ t!(Header, "#6ae6dd" => Hex("6ae6dd"));
+ t!(Header, "#8A083c" => Hex("8A083c"));
}
#[test]
@@ -447,35 +481,6 @@ mod tests {
}
#[test]
- fn tokenize_escaped_symbols() {
- t!(Body, r"\\" => T(r"\"));
- t!(Body, r"\[" => T("["));
- t!(Body, r"\]" => T("]"));
- t!(Body, r"\*" => T("*"));
- t!(Body, r"\_" => T("_"));
- t!(Body, r"\`" => T("`"));
- t!(Body, r"\/" => T("/"));
- t!(Body, r"\u{2603}" => UE("2603", true));
- t!(Body, r"\u{26A4" => UE("26A4", false));
- t!(Body, r#"\""# => T("\""));
- }
-
- #[test]
- fn tokenize_unescapable_symbols() {
- t!(Body, r"\a" => T("\\"), T("a"));
- t!(Body, r"\:" => T(r"\"), T(":"));
- t!(Body, r"\=" => T(r"\"), T("="));
- t!(Body, r"\u{2GA4" => UE("2", false), T("GA4"));
- t!(Body, r"\u{ " => UE("", false), Space(0));
- t!(Body, r"\u" => T("\\"), T("u"));
- t!(Header, r"\\\\" => Invalid(r"\\\\"));
- t!(Header, r"\a" => Invalid(r"\a"));
- t!(Header, r"\:" => Invalid(r"\"), Colon);
- t!(Header, r"\=" => Invalid(r"\"), Equals);
- t!(Header, r"\," => Invalid(r"\"), Comma);
- }
-
- #[test]
fn tokenize_with_spans() {
ts!(Body, "hello" => s(0, 5, T("hello")));
ts!(Body, "ab\r\nc" => s(0, 2, T("ab")), s(2, 4, S(1)), s(4, 5, T("c")));