diff options
| -rw-r--r-- | src/parse/mod.rs | 10 | ||||
| -rw-r--r-- | src/parse/resolve.rs | 3 | ||||
| -rw-r--r-- | src/parse/tests.rs | 22 | ||||
| -rw-r--r-- | src/parse/tokens.rs | 725 | ||||
| -rw-r--r-- | src/syntax/token.rs | 225 |
5 files changed, 606 insertions, 379 deletions
diff --git a/src/parse/mod.rs b/src/parse/mod.rs index 150b5ed1..c03cb63d 100644 --- a/src/parse/mod.rs +++ b/src/parse/mod.rs @@ -75,7 +75,7 @@ fn node(p: &mut Parser, at_start: bool) -> Option<Node> { Token::Underscore => Node::Emph, Token::Tilde => Node::Text("\u{00A0}".into()), Token::Backslash => Node::Linebreak, - Token::Hashtag => { + Token::Hash => { if at_start { return Some(Node::Heading(heading(p))); } else { @@ -98,10 +98,10 @@ fn node(p: &mut Parser, at_start: bool) -> Option<Node> { fn heading(p: &mut Parser) -> NodeHeading { // Count hashtags. let mut level = p.span(|p| { - p.eat_assert(Token::Hashtag); + p.eat_assert(Token::Hash); let mut level = 0u8; - while p.eat_if(Token::Hashtag) { + while p.eat_if(Token::Hash) { level = level.saturating_add(1); } level @@ -240,7 +240,7 @@ fn bracket_body(p: &mut Parser) -> Tree { fn expr(p: &mut Parser) -> Option<Expr> { binops(p, term, |token| match token { Token::Plus => Some(BinOp::Add), - Token::Hyphen => Some(BinOp::Sub), + Token::Hyph => Some(BinOp::Sub), _ => None, }) } @@ -282,7 +282,7 @@ fn binops( /// Parse a factor of the form `-?value`. fn factor(p: &mut Parser) -> Option<Expr> { let op = |token| match token { - Token::Hyphen => Some(UnOp::Neg), + Token::Hyph => Some(UnOp::Neg), _ => None, }; diff --git a/src/parse/resolve.rs b/src/parse/resolve.rs index c4afc430..3adbf11f 100644 --- a/src/parse/resolve.rs +++ b/src/parse/resolve.rs @@ -17,6 +17,7 @@ pub fn resolve_string(string: &str) -> String { Some('\\') => out.push('\\'), Some('"') => out.push('"'), Some('n') => out.push('\n'), + Some('r') => out.push('\r'), Some('t') => out.push('\t'), Some('u') if s.eat_if('{') => { // TODO: Feedback if closing brace is missing. @@ -137,7 +138,7 @@ mod tests { test(r#"av\u{6797"#, "avζ"); test(r#"a\\"#, "a\\"); test(r#"a\\\nbc"#, "a\\\nbc"); - test(r#"a\tbc"#, "a\tbc"); + test(r#"a\t\r\nbc"#, "a\t\r\nbc"); test(r"π", "π"); test(r"π\", r"π\"); test(r"\π", r"\π"); diff --git a/src/parse/tests.rs b/src/parse/tests.rs index 833d6661..fd8c63ca 100644 --- a/src/parse/tests.rs +++ b/src/parse/tests.rs @@ -226,25 +226,31 @@ fn test_parse_simple_nodes() { #[test] fn test_parse_headings() { // Basics with spans. - t!("#a" - nodes: [S(0..2, Heading(S(0..1, 0), Content![@S(1..2, Text("a"))]))], + t!("# a" + nodes: [S(0..3, Heading(S(0..1, 0), Content![ + @S(1..2, Space), S(2..3, Text("a")) + ]))], spans: true); // Multiple hashtags. - t!("###three" Heading(2, Content![@Text("three")])); + t!("### three" Heading(2, Content![@Space, Text("three")])); t!("###### six" Heading(5, Content![@Space, Text("six")])); // Start of heading. t!("/**/#" Heading(0, Content![@])); - t!("[f][#ok]" Call!("f", Args![Content![Heading(0, Content![@Text("ok")])]])); + t!("[f][# ok]" Call!("f", Args![Content![Heading(0, Content![ + @Space, Text("ok") + ])]])); // End of heading. - t!("#a\nb" Heading(0, Content![@Text("a")]), Space, Text("b")); + t!("# a\nb" Heading(0, Content![@Space, Text("a")]), Space, Text("b")); // Continued heading. - t!("#a{\n1\n}b" Heading(0, Content![@Text("a"), Block(Int(1)), Text("b")])); - t!("#a[f][\n\n]d" Heading(0, Content![@ - Text("a"), Call!("f", Args![Content![Parbreak]]), Text("d"), + t!("# a{\n1\n}b" Heading(0, Content![ + @Space, Text("a"), Block(Int(1)), Text("b") + ])); + t!("# a[f][\n\n]d" Heading(0, Content![@ + Space, Text("a"), Call!("f", Args![Content![Parbreak]]), Text("d"), ])); // No heading. diff --git a/src/parse/tokens.rs b/src/parse/tokens.rs index 1e49d1c6..7f162b4c 100644 --- a/src/parse/tokens.rs +++ b/src/parse/tokens.rs @@ -1,7 +1,7 @@ use std::fmt::{self, Debug, Formatter}; use super::{is_newline, Scanner}; -use crate::geom::LengthUnit; +use crate::geom::{AngularUnit, LengthUnit}; use crate::syntax::*; use TokenMode::*; @@ -62,20 +62,20 @@ impl<'s> Iterator for Tokens<'s> { loop { // Common elements. return Some(match c { + // Functions and blocks. + '[' => Token::LeftBracket, + ']' => Token::RightBracket, + '{' => Token::LeftBrace, + '}' => Token::RightBrace, + // Whitespace. - c if c.is_whitespace() => self.whitespace(c, start), + c if c.is_whitespace() => self.whitespace(c), // Comments. '/' if self.s.eat_if('/') => self.line_comment(), '/' if self.s.eat_if('*') => self.block_comment(), '*' if self.s.eat_if('/') => Token::Invalid(self.s.eaten_from(start)), - // Functions and blocks. - '[' => Token::LeftBracket, - ']' => Token::RightBracket, - '{' => Token::LeftBrace, - '}' => Token::RightBrace, - _ => break, }); } @@ -86,96 +86,132 @@ impl<'s> Iterator for Tokens<'s> { '*' => Token::Star, '_' => Token::Underscore, '~' => Token::Tilde, - '#' => Token::Hashtag, + '#' => self.hash(start), '`' => self.raw(), - - // Escape sequences. - '\\' => self.escaped(), + '$' => self.math(), + '\\' => self.backslash(), // Plain text. _ => self.text(start), }, Header => match c { - // Syntactic elements in headers. + // Parens. '(' => Token::LeftParen, ')' => Token::RightParen, - ':' => Token::Colon, + + // Length two. + '=' if self.s.eat_if('=') => Token::EqEq, + '!' if self.s.eat_if('=') => Token::BangEq, + '<' if self.s.eat_if('=') => Token::LtEq, + '>' if self.s.eat_if('=') => Token::GtEq, + '+' if self.s.eat_if('=') => Token::PlusEq, + '-' if self.s.eat_if('=') => Token::HyphEq, + '*' if self.s.eat_if('=') => Token::StarEq, + '/' if self.s.eat_if('=') => Token::SlashEq, + '.' if self.s.eat_if('.') => Token::Dots, + '=' if self.s.eat_if('>') => Token::Arrow, + + // Length one. ',' => Token::Comma, + ':' => Token::Colon, '|' => Token::Pipe, '+' => Token::Plus, - '-' => Token::Hyphen, + '-' => Token::Hyph, '*' => Token::Star, '/' => Token::Slash, + '=' => Token::Eq, + '<' => Token::Lt, + '>' => Token::Gt, + '?' => Token::Question, + + // Identifiers. + c if is_id_start(c) => self.ident(start), + + // Numbers. + c if c.is_ascii_digit() + || (c == '.' && self.s.check(|n| n.is_ascii_digit())) => + { + self.number(start, c) + } - // Expressions in headers. + // Hex values and strings. '#' => self.hex(), '"' => self.string(), - // Expressions. - c => self.expr(c, start), + _ => Token::Invalid(self.s.eaten_from(start)), }, }) } } impl<'s> Tokens<'s> { - fn whitespace(&mut self, first: char, start: usize) -> Token<'s> { + fn whitespace(&mut self, first: char) -> Token<'s> { // Fast path for just a single space if first == ' ' && !self.s.check(|c| c.is_whitespace()) { - return Token::Space(0); - } - - self.s.jump(start); + Token::Space(0) + } else { + self.s.uneat(); + + // Count the number of newlines. + let mut newlines = 0; + while let Some(c) = self.s.eat_merging_crlf() { + if !c.is_whitespace() { + self.s.uneat(); + break; + } - // Count the number of newlines. - let mut newlines = 0; - while let Some(c) = self.s.eat_merging_crlf() { - if !c.is_whitespace() { - self.s.uneat(); - break; + if is_newline(c) { + newlines += 1; + } } - if is_newline(c) { - newlines += 1; - } + Token::Space(newlines) } - - Token::Space(newlines) - } - - fn line_comment(&mut self) -> Token<'s> { - Token::LineComment(self.s.eat_until(is_newline)) } - fn block_comment(&mut self) -> Token<'s> { - let start = self.s.index(); - - let mut state = '_'; - let mut depth = 1; - - // Find the first `*/` that does not correspond to a nested `/*`. + fn text(&mut self, start: usize) -> Token<'s> { while let Some(c) = self.s.eat() { - state = match (state, c) { - ('*', '/') => { - depth -= 1; - if depth == 0 { - break; - } - '_' - } - ('/', '*') => { - depth += 1; - '_' - } - _ => c, + if match c { + // Whitespace. + c if c.is_whitespace() => true, + // Comments. + '/' if self.s.check(|c| c == '/' || c == '*') => true, + // Parenthesis. + '[' | ']' | '{' | '}' => true, + // Markup. + '*' | '_' | '#' | '~' | '`' => true, + // Escaping. + '\\' => true, + _ => false, + } { + self.s.uneat(); + break; } } - let terminated = depth == 0; - let end = self.s.index() - if terminated { 2 } else { 0 }; + Token::Text(self.s.eaten_from(start)) + } - Token::BlockComment(self.s.get(start .. end)) + fn hash(&mut self, start: usize) -> Token<'s> { + if self.s.check(is_id_start) { + self.s.eat(); + self.s.eat_while(is_id_continue); + match self.s.eaten_from(start) { + "#let" => Token::Let, + "#if" => Token::If, + "#else" => Token::Else, + "#for" => Token::For, + "#in" => Token::In, + "#while" => Token::While, + "#break" => Token::Break, + "#continue" => Token::Continue, + "#return" => Token::Return, + s => Token::Invalid(s), + } + } else { + Token::Hash + } } fn raw(&mut self) -> Token<'s> { @@ -205,7 +241,38 @@ impl<'s> Tokens<'s> { }) } - fn escaped(&mut self) -> Token<'s> { + fn math(&mut self) -> Token<'s> { + let mut dollars = 1; + if self.s.eat_if('$') { + dollars = 2; + } + + let start = self.s.index(); + + let mut found = 0; + let mut escaped = false; + while found < dollars { + match self.s.eat() { + Some('$') if !escaped => found += 1, + Some(c) => { + found = 0; + escaped = c == '\\' && !escaped; + } + None => break, + } + } + + let terminated = found == dollars; + let end = self.s.index() - if terminated { found } else { 0 }; + + Token::Math(TokenMath { + formula: self.s.get(start .. end), + inline: dollars == 1, + terminated, + }) + } + + fn backslash(&mut self) -> Token<'s> { if let Some(c) = self.s.peek() { match c { // Backslash and comments. @@ -235,27 +302,79 @@ impl<'s> Tokens<'s> { } } - fn text(&mut self, start: usize) -> Token<'s> { - while let Some(c) = self.s.eat() { - if match c { - // Whitespace. - c if c.is_whitespace() => true, - // Comments. - '/' if self.s.check(|c| c == '/' || c == '*') => true, - // Parenthesis. - '[' | ']' | '{' | '}' => true, - // Markup. - '*' | '_' | '#' | '~' | '`' => true, - // Escaping. - '\\' => true, - _ => false, - } { - self.s.uneat(); - break; + fn ident(&mut self, start: usize) -> Token<'s> { + self.s.eat_while(is_id_continue); + match self.s.eaten_from(start) { + "not" => Token::Not, + "and" => Token::And, + "or" => Token::Or, + "let" => Token::Let, + "if" => Token::If, + "else" => Token::Else, + "for" => Token::For, + "in" => Token::In, + "while" => Token::While, + "break" => Token::Break, + "continue" => Token::Continue, + "return" => Token::Return, + "none" => Token::None, + "true" => Token::Bool(true), + "false" => Token::Bool(false), + id => Token::Ident(id), + } + } + + fn number(&mut self, start: usize, first: char) -> Token<'s> { + // Read the first part (integer or fractional depending on `first`). + self.s.eat_while(|c| c.is_ascii_digit()); + + // Read the fractional part if not already done and present. + if first != '.' && self.s.eat_if('.') { + self.s.eat_while(|c| c.is_ascii_digit()); + } + + // Read the exponent. + if self.s.eat_if('e') || self.s.eat_if('E') { + let _ = self.s.eat_if('+') || self.s.eat_if('-'); + self.s.eat_while(|c| c.is_ascii_digit()); + } + + // Read the suffix. + let suffix_start = self.s.index(); + if !self.s.eat_if('%') { + self.s.eat_while(|c| c.is_ascii_alphanumeric()); + } + + let number = self.s.get(start .. suffix_start); + let suffix = self.s.eaten_from(suffix_start); + let all = self.s.eaten_from(start); + + // Find out whether it is a simple number. + if suffix.is_empty() { + if let Ok(int) = number.parse::<i64>() { + return Token::Int(int); + } else if let Ok(float) = number.parse::<f64>() { + return Token::Float(float); } } - Token::Text(self.s.eaten_from(start)) + // Otherwise parse into the fitting numeric type. + let build = match suffix { + "%" => Token::Percent, + "pt" => |x| Token::Length(x, LengthUnit::Pt), + "mm" => |x| Token::Length(x, LengthUnit::Mm), + "cm" => |x| Token::Length(x, LengthUnit::Cm), + "in" => |x| Token::Length(x, LengthUnit::In), + "rad" => |x| Token::Angle(x, AngularUnit::Rad), + "deg" => |x| Token::Angle(x, AngularUnit::Deg), + _ => return Token::Invalid(all), + }; + + if let Ok(float) = number.parse::<f64>() { + build(float) + } else { + Token::Invalid(all) + } } fn hex(&mut self) -> Token<'s> { @@ -278,64 +397,38 @@ impl<'s> Tokens<'s> { }) } - fn expr(&mut self, first: char, start: usize) -> Token<'s> { - if is_id_start(first) { - self.ident(start) - } else if first.is_ascii_digit() - || (first == '.' && self.s.check(|c| c.is_ascii_digit())) - { - self.number(start) - } else { - Token::Invalid(self.s.eaten_from(start)) - } - } - - fn ident(&mut self, start: usize) -> Token<'s> { - self.s.eat_while(is_id_continue); - let string = self.s.eaten_from(start); - match string { - "none" => Token::None, - "true" => Token::Bool(true), - "false" => Token::Bool(false), - _ => Token::Ident(string), - } + fn line_comment(&mut self) -> Token<'s> { + Token::LineComment(self.s.eat_until(is_newline)) } - fn number(&mut self, start: usize) -> Token<'s> { - self.s.jump(start); - - // Read the integer part. - self.s.eat_while(|c| c.is_ascii_digit()); + fn block_comment(&mut self) -> Token<'s> { + let start = self.s.index(); - // Read the fractional part if present. - if self.s.eat_if('.') { - self.s.eat_while(|c| c.is_ascii_digit()); - } + let mut state = '_'; + let mut depth = 1; - // Read the exponent. - if self.s.eat_if('e') || self.s.eat_if('E') { - let _ = self.s.eat_if('+') || self.s.eat_if('-'); - self.s.eat_while(|c| c.is_ascii_digit()); + // Find the first `*/` that does not correspond to a nested `/*`. + while let Some(c) = self.s.eat() { + state = match (state, c) { + ('*', '/') => { + depth -= 1; + if depth == 0 { + break; + } + '_' + } + ('/', '*') => { + depth += 1; + '_' + } + _ => c, + } } - // Read the suffix. - if !self.s.eat_if('%') { - self.s.eat_while(|c| c.is_ascii_alphanumeric()); - } + let terminated = depth == 0; + let end = self.s.index() - if terminated { 2 } else { 0 }; - // Parse into one of the suitable types. - let string = self.s.eaten_from(start); - if let Some(percent) = parse_percent(string) { - Token::Percent(percent) - } else if let Some((val, unit)) = parse_length(string) { - Token::Length(val, unit) - } else if let Ok(int) = string.parse::<i64>() { - Token::Int(int) - } else if let Ok(float) = string.parse::<f64>() { - Token::Float(float) - } else { - Token::Invalid(string) - } + Token::BlockComment(self.s.get(start .. end)) } } @@ -345,40 +438,12 @@ impl Debug for Tokens<'_> { } } -fn parse_percent(string: &str) -> Option<f64> { - string.strip_suffix('%').and_then(|prefix| prefix.parse::<f64>().ok()) -} - -fn parse_length(string: &str) -> Option<(f64, LengthUnit)> { - let len = string.len(); - - // We need at least some number and the unit. - if len <= 2 { - return None; - } - - // We can view the string as bytes since a multibyte UTF-8 char cannot - // have valid ASCII chars as subbytes. - let split = len - 2; - let bytes = string.as_bytes(); - let unit = match &bytes[split ..] { - b"pt" => LengthUnit::Pt, - b"mm" => LengthUnit::Mm, - b"cm" => LengthUnit::Cm, - b"in" => LengthUnit::In, - _ => return None, - }; - - string[.. split].parse::<f64>().ok().map(|val| (val, unit)) -} - #[cfg(test)] #[allow(non_snake_case)] mod tests { use super::*; use crate::parse::tests::check; - use LengthUnit::*; use Option::None; use Token::{Ident, *}; @@ -386,6 +451,10 @@ mod tests { Token::Raw(TokenRaw { text, backticks, terminated }) } + fn Math(formula: &str, inline: bool, terminated: bool) -> Token { + Token::Math(TokenMath { formula, inline, terminated }) + } + fn UnicodeEscape(sequence: &str, terminated: bool) -> Token { Token::UnicodeEscape(TokenUnicodeEscape { sequence, terminated }) } @@ -424,6 +493,7 @@ mod tests { // Letter suffixes. ('a', Some(Body), "hello", Text("hello")), ('a', Some(Body), "π", Text("π")), + ('a', Some(Header), "if", If), ('a', Some(Header), "val", Ident("val")), ('a', Some(Header), "Ξ±", Ident("Ξ±")), ('a', Some(Header), "_", Ident("_")), @@ -437,9 +507,10 @@ mod tests { ('/', Some(Body), "*", Star), ('/', Some(Body), "_", Underscore), ('/', Some(Body), r"\\", Text(r"\")), + ('/', Some(Body), "#let", Let), ('/', Some(Header), "(", LeftParen), ('/', Some(Header), ":", Colon), - ('/', Some(Header), "+", Plus), + ('/', Some(Header), "+=", PlusEq), ('/', Some(Header), "#123", Hex("123")), ]; @@ -473,89 +544,129 @@ mod tests { } #[test] - fn test_length_from_str() { - assert_eq!(parse_length("2.5cm"), Some((2.5, Cm))); - assert_eq!(parse_length("1.e+2cm"), Some((100.0, Cm))); - assert_eq!(parse_length("123π"), None); + fn test_tokenize_brackets() { + // Test body. + t!(Body: "[" => LeftBracket); + t!(Body: "]" => RightBracket); + t!(Body: "{" => LeftBrace); + t!(Body: "}" => RightBrace); + t!(Body[" /"]: "(" => Text("(")); + t!(Body[" /"]: ")" => Text(")")); + + // Test header. + t!(Header: "[" => LeftBracket); + t!(Header: "]" => RightBracket); + t!(Header: "{" => LeftBrace); + t!(Header: "}" => RightBrace); + t!(Header: "(" => LeftParen); + t!(Header: ")" => RightParen); } #[test] - fn test_tokenize_whitespace() { - // Test basic whitespace. - t!(Both["a1/"]: "" => ); - t!(Both["a1/"]: " " => Space(0)); - t!(Both["a1/"]: " " => Space(0)); - t!(Both["a1/"]: "\t" => Space(0)); - t!(Both["a1/"]: " \t" => Space(0)); - t!(Both["a1/"]: "\u{202F}" => Space(0)); - - // Test newline counting. - t!(Both["a1/"]: "\n" => Space(1)); - t!(Both["a1/"]: "\n " => Space(1)); - t!(Both["a1/"]: " \n" => Space(1)); - t!(Both["a1/"]: " \n " => Space(1)); - t!(Both["a1/"]: "\r\n" => Space(1)); - t!(Both["a1/"]: " \n\t \n " => Space(2)); - t!(Both["a1/"]: "\n\r" => Space(2)); - t!(Both["a1/"]: " \r\r\n \x0D" => Space(3)); + fn test_tokenize_body_symbols() { + // Test markup tokens. + t!(Body[" a1"]: "*" => Star); + t!(Body: "_" => Underscore); + t!(Body["a1/"]: "# " => Hash, Space(0)); + t!(Body: "~" => Tilde); + t!(Body[" "]: r"\" => Backslash); } #[test] - fn test_tokenize_line_comments() { - // Test line comment with no trailing newline. - t!(Both[""]: "//" => LineComment("")); - - // Test line comment ends at newline. - t!(Both["a1/"]: "//bc\n" => LineComment("bc"), Space(1)); - t!(Both["a1/"]: "// bc \n" => LineComment(" bc "), Space(1)); - t!(Both["a1/"]: "//bc\r\n" => LineComment("bc"), Space(1)); - - // Test nested line comments. - t!(Both["a1/"]: "//a//b\n" => LineComment("a//b"), Space(1)); + fn test_tokenize_header_symbols() { + // Test all symbols. + t!(Header: "," => Comma); + t!(Header: ":" => Colon); + t!(Header: "|" => Pipe); + t!(Header: "+" => Plus); + t!(Header: "-" => Hyph); + t!(Header[" a1"]: "*" => Star); + t!(Header[" a1"]: "/" => Slash); + t!(Header: "=" => Eq); + t!(Header: "==" => EqEq); + t!(Header: "!=" => BangEq); + t!(Header: "<" => Lt); + t!(Header: "<=" => LtEq); + t!(Header: ">" => Gt); + t!(Header: ">=" => GtEq); + t!(Header: "+=" => PlusEq); + t!(Header: "-=" => HyphEq); + t!(Header: "*=" => StarEq); + t!(Header: "/=" => SlashEq); + t!(Header: "?" => Question); + t!(Header: ".." => Dots); + t!(Header: "=>" => Arrow); + + // Test combinations. + t!(Header: "|=>" => Pipe, Arrow); + t!(Header: "<=>" => LtEq, Gt); + t!(Header[" a/"]: "..." => Dots, Invalid(".")); + + // Test hyphen as symbol vs part of identifier. + t!(Header[" /"]: "-1" => Hyph, Int(1)); + t!(Header[" /"]: "-a" => Hyph, Ident("a")); + t!(Header[" /"]: "--1" => Hyph, Hyph, Int(1)); + t!(Header[" /"]: "--_a" => Hyph, Hyph, Ident("_a")); + t!(Header[" /"]: "a-b" => Ident("a-b")); } #[test] - fn test_tokenize_block_comments() { - // Test basic block comments. - t!(Both[""]: "/*" => BlockComment("")); - t!(Both: "/**/" => BlockComment("")); - t!(Both: "/*π*/" => BlockComment("π")); - t!(Both: "/*\n*/" => BlockComment("\n")); + fn test_tokenize_keywords() { + let both = [ + ("let", Let), + ("if", If), + ("else", Else), + ("for", For), + ("in", In), + ("while", While), + ("break", Break), + ("continue", Continue), + ("return", Return), + ]; - // Test depth 1 and 2 nested block comments. - t!(Both: "/* /* */ */" => BlockComment(" /* */ ")); - t!(Both: "/*/*/**/*/*/" => BlockComment("/*/**/*/")); + for &(s, t) in &both { + t!(Header[" "]: s => t); + t!(Body[" "]: format!("#{}", s) => t); + t!(Body[" "]: format!("#{0}#{0}", s) => t, t); + t!(Body[" /"]: format!("# {}", s) => Hash, Space(0), Text(s)); + } - // Test two nested, one unclosed block comments. - t!(Both[""]: "/*/*/**/*/" => BlockComment("/*/**/*/")); + let header = [ + ("not", Not), + ("and", And), + ("or", Or), + ("none", Token::None), + ("false", Bool(false)), + ("true", Bool(true)), + ]; - // Test all combinations of up to two following slashes and stars. - t!(Both[""]: "/*" => BlockComment("")); - t!(Both[""]: "/*/" => BlockComment("/")); - t!(Both[""]: "/**" => BlockComment("*")); - t!(Both[""]: "/*//" => BlockComment("//")); - t!(Both[""]: "/*/*" => BlockComment("/*")); - t!(Both[""]: "/**/" => BlockComment("")); - t!(Both[""]: "/***" => BlockComment("**")); + for &(s, t) in &header { + t!(Header[" "]: s => t); + t!(Body[" /"]: s => Text(s)); + } + + // Test invalid case. + t!(Header[" /"]: "None" => Ident("None")); + t!(Header[" /"]: "True" => Ident("True")); + + // Test word that contains keyword. + t!(Body[" "]: "#letter" => Invalid("#letter")); + t!(Header[" /"]: "falser" => Ident("falser")); } #[test] - fn test_tokenize_body_tokens() { - // Test parentheses. - t!(Body: "[" => LeftBracket); - t!(Body: "]" => RightBracket); - t!(Body: "{" => LeftBrace); - t!(Body: "}" => RightBrace); - - // Test markup tokens. - t!(Body[" a1"]: "*" => Star); - t!(Body: "_" => Underscore); - t!(Body: "~" => Tilde); - t!(Body: "#" => Hashtag); - t!(Body[" "]: r"\" => Backslash); + fn test_tokenize_text() { + // Test basic text. + t!(Body[" /"]: "hello" => Text("hello")); + t!(Body[" /"]: "hello-world" => Text("hello-world")); - // Test header symbols. + // Test header symbols in text. + t!(Body[" /"]: "a():\"b" => Text("a():\"b")); t!(Body[" /"]: ":,=|/+-" => Text(":,=|/+-")); + + // Test text ends. + t!(Body[""]: "hello " => Text("hello"), Space(0)); + t!(Body[""]: "hello~" => Text("hello"), Tilde); } #[test] @@ -578,6 +689,21 @@ mod tests { } #[test] + fn test_tokenize_math_formulas() { + // Test basic formula. + t!(Body: "$x$" => Math("x", true, true)); + t!(Body: "$$x + y$$" => Math("x + y", false, true)); + + // Test unterminated. + t!(Body[""]: "$$x" => Math("x", false, false)); + t!(Body[""]: "$$x$\n$" => Math("x$\n$", false, false)); + + // Test escape sequences. + t!(Body: r"$$\\\$$$" => Math(r"\\\$", false, true)); + t!(Body[""]: r"$$ $\\$" => Math(r" $\\$", false, false)); + } + + #[test] fn test_tokenize_escape_sequences() { // Test escapable symbols. t!(Body: r"\\" => Text(r"\")); @@ -614,49 +740,24 @@ mod tests { } #[test] - fn test_tokenize_text() { - // Test basic text. - t!(Body[" /"]: "hello" => Text("hello")); - t!(Body[" /"]: "hello-world" => Text("hello-world")); - - // Test header symbols in text. - t!(Body[" /"]: "a():\"b" => Text("a():\"b")); - - // Test text ends. - t!(Body[""]: "hello " => Text("hello"), Space(0)); - t!(Body[""]: "hello~" => Text("hello"), Tilde); - } - - #[test] - fn test_tokenize_header_tokens() { - // Test parentheses. - t!(Header: "[" => LeftBracket); - t!(Header: "]" => RightBracket); - t!(Header: "{" => LeftBrace); - t!(Header: "}" => RightBrace); - t!(Header: "(" => LeftParen); - t!(Header: ")" => RightParen); - - // Test structural tokens. - t!(Header: ":" => Colon); - t!(Header: "," => Comma); - t!(Header: "|" => Pipe); - t!(Header: "+" => Plus); - t!(Header: "-" => Hyphen); - t!(Header[" a1"]: "*" => Star); - t!(Header[" a1"]: "/" => Slash); - - // Test hyphen parsed as symbol. - t!(Header[" /"]: "-1" => Hyphen, Int(1)); - t!(Header[" /"]: "-a" => Hyphen, Ident("a")); - t!(Header[" /"]: "--1" => Hyphen, Hyphen, Int(1)); - t!(Header[" /"]: "--_a" => Hyphen, Hyphen, Ident("_a")); - t!(Header[" /"]: "a-b" => Ident("a-b")); + fn test_tokenize_whitespace() { + // Test basic whitespace. + t!(Both["a1/"]: "" => ); + t!(Both["a1/"]: " " => Space(0)); + t!(Both["a1/"]: " " => Space(0)); + t!(Both["a1/"]: "\t" => Space(0)); + t!(Both["a1/"]: " \t" => Space(0)); + t!(Both["a1/"]: "\u{202F}" => Space(0)); - // Test some operations. - t!(Header[" /"]: "1+3" => Int(1), Plus, Int(3)); - t!(Header[" /"]: "1*3" => Int(1), Star, Int(3)); - t!(Header[" /"]: "1/3" => Int(1), Slash, Int(3)); + // Test newline counting. + t!(Both["a1/"]: "\n" => Space(1)); + t!(Both["a1/"]: "\n " => Space(1)); + t!(Both["a1/"]: " \n" => Space(1)); + t!(Both["a1/"]: " \n " => Space(1)); + t!(Both["a1/"]: "\r\n" => Space(1)); + t!(Both["a1/"]: " \n\t \n " => Space(2)); + t!(Both["a1/"]: "\n\r" => Space(2)); + t!(Both["a1/"]: " \r\r\n \x0D" => Space(3)); } #[test] @@ -677,22 +778,7 @@ mod tests { } #[test] - fn test_tokenize_keywords() { - // Test none. - t!(Header[" /"]: "none" => Token::None); - t!(Header[" /"]: "None" => Ident("None")); - - // Test valid bools. - t!(Header[" /"]: "false" => Bool(false)); - t!(Header[" /"]: "true" => Bool(true)); - - // Test invalid bools. - t!(Header[" /"]: "True" => Ident("True")); - t!(Header[" /"]: "falser" => Ident("falser")); - } - - #[test] - fn test_tokenize_numeric_values() { + fn test_tokenize_numeric() { let ints = [("7", 7), ("012", 12)]; let floats = [ (".3", 0.3), @@ -721,6 +807,7 @@ mod tests { } // Test attached numbers. + t!(Header[" /"]: ".2.3" => Float(0.2), Float(0.3)); t!(Header[" /"]: "1.2.3" => Float(1.2), Float(0.3)); t!(Header[" /"]: "1e-2+3" => Float(0.01), Plus, Int(3)); @@ -731,20 +818,20 @@ mod tests { // Combined integers and floats. let nums = ints.iter().map(|&(k, v)| (k, v as f64)).chain(floats.iter().copied()); - // Test percentages. - for (s, v) in nums.clone() { - t!(Header[" /"]: format!("{}%", s) => Percent(v)); - } + let suffixes = [ + ("%", Percent as fn(f64) -> Token<'static>), + ("mm", |x| Length(x, LengthUnit::Mm)), + ("pt", |x| Length(x, LengthUnit::Pt)), + ("cm", |x| Length(x, LengthUnit::Cm)), + ("in", |x| Length(x, LengthUnit::In)), + ("rad", |x| Angle(x, AngularUnit::Rad)), + ("deg", |x| Angle(x, AngularUnit::Deg)), + ]; - // Test lengths. - for &unit in &[ - LengthUnit::Mm, - LengthUnit::Pt, - LengthUnit::Cm, - LengthUnit::In, - ] { + // Numeric types. + for &(suffix, build) in &suffixes { for (s, v) in nums.clone() { - t!(Header[" /"]: format!("{}{}", s, unit) => Length(v, unit)); + t!(Header[" /"]: format!("{}{}", s, suffix) => build(v)); } } } @@ -765,6 +852,8 @@ mod tests { t!(Header: "\"hi\"" => Str("hi", true)); t!(Header: "\"hi\nthere\"" => Str("hi\nthere", true)); t!(Header: "\"π\"" => Str("π", true)); + + // Test unterminated. t!(Header[""]: "\"hi" => Str("hi", false)); // Test escaped quote. @@ -773,6 +862,45 @@ mod tests { } #[test] + fn test_tokenize_line_comments() { + // Test line comment with no trailing newline. + t!(Both[""]: "//" => LineComment("")); + + // Test line comment ends at newline. + t!(Both["a1/"]: "//bc\n" => LineComment("bc"), Space(1)); + t!(Both["a1/"]: "// bc \n" => LineComment(" bc "), Space(1)); + t!(Both["a1/"]: "//bc\r\n" => LineComment("bc"), Space(1)); + + // Test nested line comments. + t!(Both["a1/"]: "//a//b\n" => LineComment("a//b"), Space(1)); + } + + #[test] + fn test_tokenize_block_comments() { + // Test basic block comments. + t!(Both[""]: "/*" => BlockComment("")); + t!(Both: "/**/" => BlockComment("")); + t!(Both: "/*π*/" => BlockComment("π")); + t!(Both: "/*\n*/" => BlockComment("\n")); + + // Test depth 1 and 2 nested block comments. + t!(Both: "/* /* */ */" => BlockComment(" /* */ ")); + t!(Both: "/*/*/**/*/*/" => BlockComment("/*/**/*/")); + + // Test two nested, one unclosed block comments. + t!(Both[""]: "/*/*/**/*/" => BlockComment("/*/**/*/")); + + // Test all combinations of up to two following slashes and stars. + t!(Both[""]: "/*" => BlockComment("")); + t!(Both[""]: "/*/" => BlockComment("/")); + t!(Both[""]: "/**" => BlockComment("*")); + t!(Both[""]: "/*//" => BlockComment("//")); + t!(Both[""]: "/*/*" => BlockComment("/*")); + t!(Both[""]: "/**/" => BlockComment("")); + t!(Both[""]: "/***" => BlockComment("**")); + } + + #[test] fn test_tokenize_invalid() { // Test invalidly closed block comments. t!(Both: "*/" => Token::Invalid("*/")); @@ -784,11 +912,14 @@ mod tests { t!(Header: r"\:" => Invalid(r"\"), Colon); t!(Header: "mealβ" => Ident("meal"), Invalid("β")); t!(Header[" /"]: r"\a" => Invalid(r"\"), Ident("a")); - t!(Header[" /"]: ">main" => Invalid(">"), Ident("main")); // Test invalid number suffixes. t!(Header[" /"]: "1foo" => Invalid("1foo")); t!(Header: "1p%" => Invalid("1p"), Invalid("%")); t!(Header: "1%%" => Percent(1.0), Invalid("%")); + + // Test invalid keyword. + t!(Body[" /"]: "#-" => Hash, Text("-")); + t!(Body[" "]: "#do" => Invalid("#do")) } } diff --git a/src/syntax/token.rs b/src/syntax/token.rs index fb50c4ec..261f2104 100644 --- a/src/syntax/token.rs +++ b/src/syntax/token.rs @@ -1,68 +1,114 @@ -use crate::geom::LengthUnit; +use crate::geom::{AngularUnit, LengthUnit}; /// A minimal semantic entity of source code. #[derive(Debug, Copy, Clone, PartialEq)] pub enum Token<'s> { - /// A consecutive non-markup string. - Text(&'s str), - /// One or more whitespace characters. - /// - /// The contained `usize` denotes the number of newlines that were contained - /// in the whitespace. - Space(usize), - - /// A line comment with inner string contents `//<str>\n`. - LineComment(&'s str), - /// A block comment with inner string contents `/*<str>*/`. - /// - /// The comment can contain nested block comments. - BlockComment(&'s str), - - /// A left bracket: `[`. + /// A left square bracket: `[`. LeftBracket, - /// A right bracket: `]`. + /// A right square bracket: `]`. RightBracket, - /// A left brace: `{`. + /// A left curly brace: `{`. LeftBrace, - /// A right brace: `}`. + /// A right curly brace: `}`. RightBrace, - /// A left parenthesis: `(`. + /// A left round parenthesis: `(`. LeftParen, - /// A right parenthesis: `)`. + /// A right round parenthesis: `)`. RightParen, - - /// A star: `*`. + /// An asterisk: `*`. Star, /// An underscore: `_`. Underscore, + /// A hashtag: `#`. + Hash, /// A tilde: `~`. Tilde, - /// A backslash followed by whitespace: `\`. + /// A backslash followed by nothing or whitespace: `\`. Backslash, - /// A hashtag indicating a section heading: `#`. - Hashtag, - /// A raw block: `` `...` ``. - Raw(TokenRaw<'s>), - /// A unicode escape sequence: `\u{1F5FA}`. - UnicodeEscape(TokenUnicodeEscape<'s>), - - /// A colon: `:`. - Colon, /// A comma: `,`. Comma, + /// A colon: `:`. + Colon, /// A pipe: `|`. Pipe, /// A plus: `+`. Plus, /// A hyphen: `-`. - Hyphen, + Hyph, /// A slash: `/`. Slash, - + /// A single equals sign: `=`. + Eq, + /// Two equals signs: `==`. + EqEq, + /// An exclamation mark followed by an equals sign: `!=`. + BangEq, + /// A less-than sign: `<`. + Lt, + /// A less-than sign followed by an equals sign: `<=`. + LtEq, + /// A greater-than sign: `>`. + Gt, + /// A greater-than sign followed by an equals sign: `>=`. + GtEq, + /// A plus followed by an equals sign: `+=`. + PlusEq, + /// A hyphen followed by an equals sign: `-=`. + HyphEq, + /// An asterisk followed by an equals sign: `*=`. + StarEq, + /// A slash followed by an equals sign: `/=`. + SlashEq, + /// A question mark: `?`. + Question, + /// Two dots: `..`. + Dots, + /// An equals sign followed by a greater-than sign: `=>`. + Arrow, + /// The `not` operator. + Not, + /// The `and` operator. + And, + /// The `or` operator. + Or, + /// The `let` / `#let` keyword. + Let, + /// The `if` / `#if` keyword. + If, + /// The `else` / `#else` keyword. + Else, + /// The `for` / `#for` keyword. + For, + /// The `in` / `#in` keyword. + In, + /// The `while` / `#while` keyword. + While, + /// The `break` / `#break` keyword. + Break, + /// The `continue` / `#continue` keyword. + Continue, + /// The `return` / `#return` keyword. + Return, + /// The none literal: `none`. + None, + /// One or more whitespace characters. + /// + /// The contained `usize` denotes the number of newlines that were contained + /// in the whitespace. + Space(usize), + /// A consecutive non-markup string. + Text(&'s str), + /// An arbitrary number of backticks followed by inner contents, terminated + /// with the same number of backticks: `` `...` ``. + Raw(TokenRaw<'s>), + /// One or two dollar signs followed by inner contents, terminated with the + /// same number of dollar signs. + Math(TokenMath<'s>), + /// A slash and the letter "u" followed by a hexadecimal unicode entity + /// enclosed in curly braces: `\u{1F5FA}`. + UnicodeEscape(TokenUnicodeEscape<'s>), /// An identifier: `center`. Ident(&'s str), - /// A none: `none`. - None, /// A boolean: `true`, `false`. Bool(bool), /// An integer: `120`. @@ -76,11 +122,20 @@ pub enum Token<'s> { /// _Note_: `50%` is stored as `50.0` here, as in the corresponding /// [literal](super::Expr::Percent). Percent(f64), + /// An angle: `90deg`. + Angle(f64, AngularUnit), /// A hex value: `#20d82a`. Hex(&'s str), /// A quoted string: `"..."`. Str(TokenStr<'s>), - + /// Two slashes followed by inner contents, terminated with a newline: + /// `//<str>\n`. + LineComment(&'s str), + /// A slash and a star followed by inner contents, terminated with a star + /// and a slash: `/*<str>*/`. + /// + /// The comment can contain nested block comments. + BlockComment(&'s str), /// Things that are not valid tokens. Invalid(&'s str), } @@ -98,15 +153,6 @@ pub struct TokenStr<'s> { pub terminated: bool, } -/// A unicode escape sequence: `\u{1F5FA}`. -#[derive(Debug, Copy, Clone, PartialEq)] -pub struct TokenUnicodeEscape<'s> { - /// The escape sequence between two braces. - pub sequence: &'s str, - /// Whether the closing brace was present. - pub terminated: bool, -} - /// A raw block: `` `...` ``. #[derive(Debug, Copy, Clone, PartialEq)] pub struct TokenRaw<'s> { @@ -118,48 +164,91 @@ pub struct TokenRaw<'s> { pub terminated: bool, } +/// A math formula: `$2pi + x$`, `$$f'(x) = x^2$$`. +#[derive(Debug, Copy, Clone, PartialEq)] +pub struct TokenMath<'s> { + /// The formula between the dollars. + pub formula: &'s str, + /// Whether the formula was surrounded by one dollar (true) or two dollars + /// (false). + pub inline: bool, + /// Whether the closing dollars were present. + pub terminated: bool, +} + +/// A unicode escape sequence: `\u{1F5FA}`. +#[derive(Debug, Copy, Clone, PartialEq)] +pub struct TokenUnicodeEscape<'s> { + /// The escape sequence between the braces. + pub sequence: &'s str, + /// Whether the closing brace was present. + pub terminated: bool, +} + impl<'s> Token<'s> { - /// The natural-language name of this token for use in error messages. + /// The English name of this token for use in error messages. pub fn name(self) -> &'static str { match self { - Self::Text(_) => "text", - Self::Space(_) => "space", - - Self::LineComment(_) => "line comment", - Self::BlockComment(_) => "block comment", - Self::LeftBracket => "opening bracket", Self::RightBracket => "closing bracket", Self::LeftBrace => "opening brace", Self::RightBrace => "closing brace", Self::LeftParen => "opening paren", Self::RightParen => "closing paren", - Self::Star => "star", Self::Underscore => "underscore", + Self::Hash => "hashtag", + Self::Tilde => "tilde", Self::Backslash => "backslash", - Self::Hashtag => "hashtag", - Self::Tilde => "tidle", - Self::Raw { .. } => "raw block", - Self::UnicodeEscape { .. } => "unicode escape sequence", - - Self::Colon => "colon", Self::Comma => "comma", + Self::Colon => "colon", Self::Pipe => "pipe", - Self::Plus => "plus sign", - Self::Hyphen => "minus sign", + Self::Plus => "plus", + Self::Hyph => "minus", Self::Slash => "slash", - + Self::Eq => "assignment operator", + Self::EqEq => "equality operator", + Self::BangEq => "inequality operator", + Self::Lt => "less than operator", + Self::LtEq => "less than or equal operator", + Self::Gt => "greater than operator", + Self::GtEq => "greater than or equal operator", + Self::PlusEq => "add-assign operator", + Self::HyphEq => "subtract-assign operator", + Self::StarEq => "multiply-assign operator", + Self::SlashEq => "divide-assign operator", + Self::Question => "question mark", + Self::Dots => "dots", + Self::Arrow => "arrow", + Self::Not => "not operator", + Self::And => "and operator", + Self::Or => "or operator", + Self::Let => "let keyword", + Self::If => "if keyword", + Self::Else => "else keyword", + Self::For => "for keyword", + Self::In => "in keyword", + Self::While => "while keyword", + Self::Break => "break keyword", + Self::Continue => "continue keyword", + Self::Return => "return keyword", Self::None => "none", + Self::Space(_) => "space", + Self::Text(_) => "text", + Self::Raw(_) => "raw block", + Self::Math(_) => "math formula", + Self::UnicodeEscape(_) => "unicode escape sequence", Self::Ident(_) => "identifier", - Self::Bool(_) => "bool", + Self::Bool(_) => "boolean", Self::Int(_) => "integer", Self::Float(_) => "float", Self::Length(..) => "length", + Self::Angle(..) => "angle", Self::Percent(_) => "percentage", Self::Hex(_) => "hex value", - Self::Str { .. } => "string", - + Self::Str(_) => "string", + Self::LineComment(_) => "line comment", + Self::BlockComment(_) => "block comment", Self::Invalid("*/") => "end of block comment", Self::Invalid(_) => "invalid token", } |
