From 3533268b1f7a31581e7b8f44dff6d4f553ef348f Mon Sep 17 00:00:00 2001 From: Laurenz Date: Fri, 2 Oct 2020 15:43:29 +0200 Subject: =?UTF-8?q?Refactor=20parser=20=F0=9F=8F=9E?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/parse/mod.rs | 919 +++++++++++++++++++++------------------------------ src/parse/parser.rs | 292 ++++++++++++++++ src/parse/resolve.rs | 26 +- src/parse/scanner.rs | 38 ++- src/parse/tests.rs | 22 +- src/parse/tokens.rs | 239 +++++++------- 6 files changed, 840 insertions(+), 696 deletions(-) create mode 100644 src/parse/parser.rs (limited to 'src/parse') diff --git a/src/parse/mod.rs b/src/parse/mod.rs index cc0b6378..2f34357c 100644 --- a/src/parse/mod.rs +++ b/src/parse/mod.rs @@ -1,11 +1,13 @@ //! Parsing and tokenization. mod lines; +mod parser; mod resolve; mod scanner; mod tokens; pub use lines::*; +pub use parser::*; pub use resolve::*; pub use scanner::*; pub use tokens::*; @@ -15,634 +17,469 @@ use std::str::FromStr; use crate::color::RgbaColor; use crate::eval::DictKey; use crate::syntax::*; -use crate::{Feedback, Pass}; +use crate::Pass; /// Parse a string of source code. pub fn parse(src: &str) -> Pass { - Parser::new(src).parse() + let mut p = Parser::new(src); + Pass::new(tree(&mut p), p.finish()) } -struct Parser<'s> { - tokens: Tokens<'s>, - peeked: Option>>>, - delimiters: Vec<(Pos, Token<'static>)>, - at_block_or_line_start: bool, - feedback: Feedback, -} - -impl<'s> Parser<'s> { - fn new(src: &'s str) -> Self { - Self { - tokens: Tokens::new(src, TokenMode::Body), - peeked: None, - delimiters: vec![], - at_block_or_line_start: true, - feedback: Feedback::new(), +/// Parse a syntax tree. +fn tree(p: &mut Parser) -> SynTree { + // We keep track of whether we are at the start of a block or paragraph + // to know whether headings are allowed. 
+ let mut at_start = true; + let mut tree = vec![]; + while !p.eof() { + if let Some(node) = node(p, at_start) { + if node.v == SynNode::Parbreak { + at_start = true; + } else if node.v != SynNode::Space { + at_start = false; + } + tree.push(node); } } - - fn parse(mut self) -> Pass { - let tree = self.parse_body_contents(); - Pass::new(tree, self.feedback) - } + tree } -// Typesetting content. -impl Parser<'_> { - fn parse_body_contents(&mut self) -> SynTree { - let mut tree = SynTree::new(); - - self.at_block_or_line_start = true; - while !self.eof() { - if let Some(node) = self.parse_node() { - tree.push(node); +/// Parse a syntax node. +fn node(p: &mut Parser, at_start: bool) -> Option> { + let token = p.eat()?; + let span = token.span; + Some(match token.v { + // Spaces. + Token::Space(newlines) => { + if newlines < 2 { + SynNode::Space.span_with(span) + } else { + SynNode::Parbreak.span_with(span) } } - - tree - } - - fn parse_node(&mut self) -> Option> { - let token = self.peek()?; - let end = Span::at(token.span.end); - - // Set block or line start to false because most nodes have that effect, but - // remember the old value to actually check it for hashtags and because comments - // and spaces want to retain it. - let was_at_block_or_line_start = self.at_block_or_line_start; - self.at_block_or_line_start = false; - - Some(match token.v { - // Starting from two newlines counts as a paragraph break, a single - // newline does not. 
- Token::Space(n) => { - if n == 0 { - self.at_block_or_line_start = was_at_block_or_line_start; - } else if n >= 1 { - self.at_block_or_line_start = true; - } - - self.with_span(if n >= 2 { SynNode::Parbreak } else { SynNode::Spacing }) - } - - Token::LineComment(_) | Token::BlockComment(_) => { - self.at_block_or_line_start = was_at_block_or_line_start; - self.eat(); - return None; - } - - Token::LeftBracket => { - let call = self.parse_bracket_call(false); - self.at_block_or_line_start = false; - call.map(|c| SynNode::Expr(Expr::Call(c))) - } - - Token::Star => self.with_span(SynNode::ToggleBolder), - Token::Underscore => self.with_span(SynNode::ToggleItalic), - Token::Backslash => self.with_span(SynNode::Linebreak), - - Token::Hashtag if was_at_block_or_line_start => { - self.parse_heading().map(SynNode::Heading) - } - - Token::Raw { raw, backticks, terminated } => { - if !terminated { - error!(@self.feedback, end, "expected backtick(s)"); - } - - let raw = resolve::resolve_raw(raw, backticks); - self.with_span(SynNode::Raw(raw)) - } - - Token::Text(text) => self.with_span(SynNode::Text(text.to_string())), - Token::Hashtag => self.with_span(SynNode::Text("#".to_string())), - - Token::UnicodeEscape { sequence, terminated } => { - if !terminated { - error!(@self.feedback, end, "expected closing brace"); - } - - if let Some(c) = resolve::resolve_hex(sequence) { - self.with_span(SynNode::Text(c.to_string())) - } else { - error!(@self.feedback, token.span, "invalid unicode escape sequence"); - // TODO: Decide whether to render the escape sequence. - self.eat(); - return None; - } - } - - unexpected => { - error!(@self.feedback, token.span, "unexpected {}", unexpected.name()); - self.eat(); - return None; + Token::Text(text) => SynNode::Text(text.into()).span_with(span), + + // Comments. + Token::LineComment(_) | Token::BlockComment(_) => return None, + + // Markup. 
+ Token::Star => SynNode::ToggleBolder.span_with(span), + Token::Underscore => SynNode::ToggleItalic.span_with(span), + Token::Backslash => SynNode::Linebreak.span_with(span), + Token::Hashtag => { + if at_start { + heading(p, span.start).map(SynNode::Heading) + } else { + SynNode::Text(p.get(span).into()).span_with(span) } - }) - } - - fn parse_heading(&mut self) -> Spanned { - let start = self.pos(); - self.assert(Token::Hashtag); - - let mut level = 0; - while self.peekv() == Some(Token::Hashtag) { - level += 1; - self.eat(); } + Token::Raw(token) => raw(p, token, span).map(SynNode::Raw), + Token::UnicodeEscape(token) => unicode_escape(p, token, span).map(SynNode::Text), - let span = Span::new(start, self.pos()); - let level = level.span_with(span); - - if level.v > 5 { - warning!( - @self.feedback, level.span, - "section depth larger than 6 has no effect", - ); + // Functions. + Token::LeftBracket => { + p.jump(span.start); + bracket_call(p).map(Expr::Call).map(SynNode::Expr) } - self.skip_ws(); - - let mut tree = SynTree::new(); - while !self.eof() && !matches!(self.peekv(), Some(Token::Space(n)) if n >= 1) { - if let Some(node) = self.parse_node() { - tree.push(node); - } + // Bad tokens. + _ => { + p.diag_unexpected(token); + return None; } - - let span = Span::new(start, self.pos()); - NodeHeading { level, contents: tree }.span_with(span) - } + }) } -// Function calls. -impl Parser<'_> { - fn parse_bracket_call(&mut self, chained: bool) -> Spanned { - let before_bracket = self.pos(); - if !chained { - self.start_group(Group::Bracket); - self.tokens.push_mode(TokenMode::Header); +/// Parse a heading. +fn heading(p: &mut Parser, start: Pos) -> Spanned { + // Parse the section depth. + let count = p.eat_while(|c| c == Token::Hashtag); + let span = (start, p.pos()); + let level = (count.min(5) as u8).span_with(span); + if count > 5 { + p.diag(warning!(span, "section depth larger than 6 has no effect")); + } + + // Parse the heading contents. 
+ p.skip_white(); + let mut contents = vec![]; + while p.check(|t| !matches!(t, Token::Space(n) if n >= 1)) { + if let Some(node) = node(p, false) { + contents.push(node); } + } - let before_name = self.pos(); - self.start_group(Group::Subheader); - self.skip_ws(); - let name = self.parse_ident().unwrap_or_else(|| { - self.expected_found_or_at("function name", before_name); - Ident(String::new()).span_with(Span::at(before_name)) - }); - - self.skip_ws(); - - let mut args = match self.eatv() { - Some(Token::Colon) => self.parse_dict_contents().0, - Some(_) => { - self.expected_at("colon", name.span.end); - while self.eat().is_some() {} - LitDict::default() - } - None => LitDict::default(), - }; - - self.end_group(); - self.skip_ws(); - let (has_chained_child, end) = if self.peek().is_some() { - let item = self.parse_bracket_call(true); - let span = item.span; - let tree = vec![item.map(|c| SynNode::Expr(Expr::Call(c)))]; - let expr = Expr::Lit(Lit::Content(tree)); - args.0.push(LitDictEntry { key: None, value: expr.span_with(span) }); - (true, span.end) - } else { - self.tokens.pop_mode(); - (false, self.end_group().end) - }; - - let start = if chained { before_name } else { before_bracket }; - let mut span = Span::new(start, end); - - if self.check(Token::LeftBracket) && !has_chained_child { - self.start_group(Group::Bracket); - self.tokens.push_mode(TokenMode::Body); - let body = self.parse_body_contents(); - self.tokens.pop_mode(); - let body_span = self.end_group(); + NodeHeading { level, contents }.span_with((start, p.pos())) +} - let expr = Expr::Lit(Lit::Content(body)); - args.0.push(LitDictEntry { - key: None, - value: expr.span_with(body_span), - }); - span.expand(body_span); - } +/// Parse a raw block. 
+fn raw(p: &mut Parser, token: TokenRaw, span: Span) -> Spanned { + let raw = resolve::resolve_raw(token.text, token.backticks); - ExprCall { name, args }.span_with(span) + if !token.terminated { + p.diag(error!(span.end, "expected backtick(s)")); } - fn parse_paren_call(&mut self, name: Spanned) -> Spanned { - self.start_group(Group::Paren); - let args = self.parse_dict_contents().0; - let args_span = self.end_group(); - let span = Span::merge(name.span, args_span); - ExprCall { name, args }.span_with(span) - } + raw.span_with(span) } -// Dicts. -impl Parser<'_> { - fn parse_dict_contents(&mut self) -> (LitDict, bool) { - let mut dict = LitDict::default(); - let mut comma_and_keyless = true; - - while { - self.skip_ws(); - !self.eof() - } { - let (key, value) = if let Some(ident) = self.parse_ident() { - self.skip_ws(); - - match self.peekv() { - Some(Token::Equals) => { - self.eat(); - self.skip_ws(); - if let Some(value) = self.parse_expr() { - (Some(ident.map(|id| DictKey::Str(id.0))), value) - } else { - self.expected("value"); - continue; - } - } - - Some(Token::LeftParen) => { - let call = self.parse_paren_call(ident); - (None, call.map(Expr::Call)) - } - - _ => (None, ident.map(|id| Expr::Lit(Lit::Ident(id)))), - } - } else if let Some(value) = self.parse_expr() { - (None, value) - } else { - self.expected("value"); - continue; - }; - - if let Some(key) = &key { - comma_and_keyless = false; - self.feedback - .decorations - .push(Decoration::DictKey.span_with(key.span)); - } - - let behind = value.span.end; - dict.0.push(LitDictEntry { key, value }); +/// Parse a unicode escape sequence. +fn unicode_escape( + p: &mut Parser, + token: TokenUnicodeEscape, + span: Span, +) -> Spanned { + let text = if let Some(c) = resolve::resolve_hex(token.sequence) { + c.to_string() + } else { + // Print out the escape sequence verbatim if it is + // invalid. 
+ p.diag(error!(span, "invalid unicode escape sequence")); + p.get(span).into() + }; + + if !token.terminated { + p.diag(error!(span.end, "expected closing brace")); + } + + text.span_with(span) +} - if { - self.skip_ws(); - self.eof() - } { - break; - } +/// Parse a bracketed function call. +fn bracket_call(p: &mut Parser) -> Spanned { + let before_bracket = p.pos(); + p.start_group(Group::Bracket); + p.push_mode(TokenMode::Header); - self.expect_at(Token::Comma, behind); - comma_and_keyless = false; - } + // One header is guaranteed, but there may be more (through chaining). + let mut outer = vec![]; + let mut inner = bracket_subheader(p); - let coercable = comma_and_keyless && !dict.0.is_empty(); - (dict, coercable) + while p.eat_if(Token::Chain).is_some() { + outer.push(inner); + inner = bracket_subheader(p); } -} -// Expressions and values. -impl Parser<'_> { - fn parse_expr(&mut self) -> Option> { - self.parse_binops("summand", Self::parse_term, |token| match token { - Token::Plus => Some(BinOp::Add), - Token::Hyphen => Some(BinOp::Sub), - _ => None, - }) - } + p.pop_mode(); + p.end_group(); - fn parse_term(&mut self) -> Option> { - self.parse_binops("factor", Self::parse_factor, |token| match token { - Token::Star => Some(BinOp::Mul), - Token::Slash => Some(BinOp::Div), - _ => None, - }) + if p.peek() == Some(Token::LeftBracket) { + let expr = bracket_body(p).map(Lit::Content).map(Expr::Lit); + inner.span.expand(expr.span); + inner.v.args.0.push(LitDictEntry { key: None, expr }); } - /// Parse expression of the form ` ( )*`. 
- fn parse_binops( - &mut self, - operand_name: &str, - mut parse_operand: impl FnMut(&mut Self) -> Option>, - mut parse_op: impl FnMut(Token) -> Option, - ) -> Option> { - let mut left = parse_operand(self)?; - - self.skip_ws(); - while let Some(token) = self.peek() { - if let Some(op) = parse_op(token.v) { - self.eat(); - self.skip_ws(); - - if let Some(right) = parse_operand(self) { - let span = Span::merge(left.span, right.span); - let expr = Expr::Binary(ExprBinary { - lhs: left.map(Box::new), - op: op.span_with(token.span), - rhs: right.map(Box::new), - }); - left = expr.span_with(span); - self.skip_ws(); - continue; - } + while let Some(mut top) = outer.pop() { + let span = inner.span; + let node = inner.map(Expr::Call).map(SynNode::Expr); + let expr = Expr::Lit(Lit::Content(vec![node])).span_with(span); + top.v.args.0.push(LitDictEntry { key: None, expr }); + inner = top; + } - error!( - @self.feedback, Span::merge(left.span, token.span), - "missing right {}", operand_name, - ); - } - break; - } + inner.v.span_with((before_bracket, p.pos())) +} - Some(left) - } +/// Parse one subheader of a bracketed function call. 
+fn bracket_subheader(p: &mut Parser) -> Spanned { + p.start_group(Group::Subheader); + let before_name = p.pos(); - fn parse_factor(&mut self) -> Option> { - if let Some(hyph) = self.check_eat(Token::Hyphen) { - self.skip_ws(); - if let Some(factor) = self.parse_factor() { - let span = Span::merge(hyph.span, factor.span); - let expr = Expr::Unary(ExprUnary { - op: UnOp::Neg.span_with(hyph.span), - expr: factor.map(Box::new), - }); - Some(expr.span_with(span)) - } else { - error!(@self.feedback, hyph.span, "dangling minus"); - None - } + p.skip_white(); + let name = ident(p).unwrap_or_else(|| { + if p.eof() { + p.diag_expected_at("function name", before_name); } else { - self.parse_value() + p.diag_expected("function name"); } - } - - fn parse_value(&mut self) -> Option> { - let Spanned { v: token, span } = self.peek()?; - Some(match token { - // This could be a function call or an identifier. - Token::Ident(id) => { - let name = Ident(id.to_string()).span_with(span); - self.eat(); - self.skip_ws(); - if self.check(Token::LeftParen) { - self.parse_paren_call(name).map(Expr::Call) - } else { - name.map(|n| Expr::Lit(Lit::Ident(n))) - } - } - - Token::Str { string, terminated } => { - if !terminated { - self.expected_at("quote", span.end); - } - self.with_span(Expr::Lit(Lit::Str(resolve::resolve_string(string)))) - } - - Token::Bool(b) => self.with_span(Expr::Lit(Lit::Bool(b))), - Token::Number(n) => self.with_span(Expr::Lit(Lit::Float(n))), - Token::Length(s) => self.with_span(Expr::Lit(Lit::Length(s))), - Token::Hex(s) => { - let color = RgbaColor::from_str(s).unwrap_or_else(|_| { - // Heal color by assuming black. - error!(@self.feedback, span, "invalid color"); - RgbaColor::new_healed(0, 0, 0, 255) - }); - self.with_span(Expr::Lit(Lit::Color(color))) - } - - // This could be a dictionary or a parenthesized expression. We - // parse as a dictionary in any case and coerce into a value if - // that's coercable (length 1 and no trailing comma). 
- Token::LeftParen => { - self.start_group(Group::Paren); - let (dict, coercable) = self.parse_dict_contents(); - let span = self.end_group(); - - let expr = if coercable { - dict.0.into_iter().next().expect("dict is coercable").value.v - } else { - Expr::Lit(Lit::Dict(dict)) - }; - - expr.span_with(span) - } - - // This is a content expression. - Token::LeftBrace => { - self.start_group(Group::Brace); - self.tokens.push_mode(TokenMode::Body); - let tree = self.parse_body_contents(); - self.tokens.pop_mode(); - let span = self.end_group(); - Expr::Lit(Lit::Content(tree)).span_with(span) - } + Ident(String::new()).span_with(before_name) + }); + + p.skip_white(); + let args = if p.eat_if(Token::Colon).is_some() { + dict_contents(p).0 + } else { + // Ignore the rest if there's no colon. + if !p.eof() { + p.diag_expected_at("colon", p.pos()); + } + p.eat_while(|_| true); + LitDict::new() + }; - // This is a bracketed function call. - Token::LeftBracket => { - let call = self.parse_bracket_call(false); - let tree = vec![call.map(|c| SynNode::Expr(Expr::Call(c)))]; - Expr::Lit(Lit::Content(tree)).span_with(span) - } + ExprCall { name, args }.span_with(p.end_group()) +} - _ => return None, - }) - } +/// Parse the body of a bracketed function call. +fn bracket_body(p: &mut Parser) -> Spanned { + p.start_group(Group::Bracket); + p.push_mode(TokenMode::Body); + let tree = tree(p); + p.pop_mode(); + tree.span_with(p.end_group()) +} - fn parse_ident(&mut self) -> Option> { - self.peek().and_then(|token| match token.v { - Token::Ident(id) => Some(self.with_span(Ident(id.to_string()))), - _ => None, - }) - } +/// Parse an expression: `term (+ term)*`. +fn expr(p: &mut Parser) -> Option> { + binops(p, "summand", term, |token| match token { + Token::Plus => Some(BinOp::Add), + Token::Hyphen => Some(BinOp::Sub), + _ => None, + }) } -// Error handling. 
-impl Parser<'_> { - fn expect_at(&mut self, token: Token<'_>, pos: Pos) -> bool { - if self.check(token) { - self.eat(); - true - } else { - self.expected_at(token.name(), pos); - false - } - } +/// Parse a term: `factor (* factor)*`. +fn term(p: &mut Parser) -> Option> { + binops(p, "factor", factor, |token| match token { + Token::Star => Some(BinOp::Mul), + Token::Slash => Some(BinOp::Div), + _ => None, + }) +} - fn expected(&mut self, thing: &str) { - if let Some(found) = self.eat() { - error!( - @self.feedback, found.span, - "expected {}, found {}", thing, found.v.name(), - ); +/// Parse binary operations of the from `a ( b)*`. +fn binops( + p: &mut Parser, + operand_name: &str, + operand: fn(&mut Parser) -> Option>, + op: fn(Token) -> Option, +) -> Option> { + let mut lhs = operand(p)?; + + loop { + p.skip_white(); + if let Some(op) = p.eat_map(op) { + p.skip_white(); + + if let Some(rhs) = operand(p) { + let span = lhs.span.join(rhs.span); + let expr = Expr::Binary(ExprBinary { + lhs: lhs.map(Box::new), + op, + rhs: rhs.map(Box::new), + }); + lhs = expr.span_with(span); + p.skip_white(); + } else { + let span = lhs.span.join(op.span); + p.diag(error!(span, "missing right {}", operand_name)); + break; + } } else { - error!(@self.feedback, Span::at(self.pos()), "expected {}", thing); + break; } } - fn expected_at(&mut self, thing: &str, pos: Pos) { - error!(@self.feedback, Span::at(pos), "expected {}", thing); - } + Some(lhs) +} - fn expected_found_or_at(&mut self, thing: &str, pos: Pos) { - if self.eof() { - self.expected_at(thing, pos) +/// Parse a factor of the form `-?value`. 
+fn factor(p: &mut Parser) -> Option> { + if let Some(op) = p.eat_map(|token| match token { + Token::Hyphen => Some(UnOp::Neg), + _ => None, + }) { + p.skip_white(); + if let Some(expr) = factor(p) { + let span = op.span.join(expr.span); + let expr = Expr::Unary(ExprUnary { op, expr: expr.map(Box::new) }); + Some(expr.span_with(span)) } else { - self.expected(thing); + p.diag(error!(op.span, "missing factor")); + None } + } else { + value(p) } } -// Parsing primitives. -impl<'s> Parser<'s> { - fn start_group(&mut self, group: Group) { - let start = self.pos(); - if let Some(start_token) = group.start() { - self.assert(start_token); +/// Parse a value. +fn value(p: &mut Parser) -> Option> { + let Spanned { v: token, span } = p.eat()?; + Some(match token { + // Bracketed function call. + Token::LeftBracket => { + p.jump(span.start); + let call = bracket_call(p); + let span = call.span; + let node = call.map(Expr::Call).map(SynNode::Expr); + Expr::Lit(Lit::Content(vec![node])).span_with(span) } - self.delimiters.push((start, group.end())); - } - fn end_group(&mut self) -> Span { - let peeked = self.peek(); - - let (start, end_token) = self.delimiters.pop().expect("group was not started"); + // Content expression. + Token::LeftBrace => { + p.jump(span.start); + content(p).map(Lit::Content).map(Expr::Lit) + } - if end_token != Token::Chain && peeked != None { - self.delimiters.push((start, end_token)); - assert_eq!(peeked, None, "unfinished group"); + // Dictionary or just a parenthesized expression. + Token::LeftParen => { + p.jump(span.start); + parenthesized(p) } - match self.peeked.unwrap() { - Some(token) if token.v == end_token => { - self.peeked = None; - Span::new(start, token.span.end) - } - _ => { - let end = self.pos(); - if end_token != Token::Chain { - error!( - @self.feedback, Span::at(end), - "expected {}", end_token.name(), - ); - } - Span::new(start, end) + // Function or just ident. 
+ Token::Ident(id) => { + let ident = Ident(id.into()).span_with(span); + + p.skip_white(); + if p.peek() == Some(Token::LeftParen) { + paren_call(p, ident).map(Expr::Call) + } else { + ident.map(Lit::Ident).map(Expr::Lit) } } - } - fn skip_ws(&mut self) { - while matches!( - self.peekv(), - Some(Token::Space(_)) | - Some(Token::LineComment(_)) | - Some(Token::BlockComment(_)) - ) { - self.eat(); + // Atomic values. + Token::Bool(b) => Expr::Lit(Lit::Bool(b)).span_with(span), + Token::Number(f) => Expr::Lit(Lit::Float(f)).span_with(span), + Token::Length(l) => Expr::Lit(Lit::Length(l)).span_with(span), + Token::Hex(hex) => color(p, hex, span).map(Lit::Color).map(Expr::Lit), + Token::Str(token) => string(p, token, span).map(Lit::Str).map(Expr::Lit), + + // No value. + _ => { + p.jump(span.start); + return None; } - } + }) +} - fn eatv(&mut self) -> Option> { - self.eat().map(Spanned::value) - } +// Parse a content expression: `{...}`. +fn content(p: &mut Parser) -> Spanned { + p.start_group(Group::Brace); + p.push_mode(TokenMode::Body); + let tree = tree(p); + p.pop_mode(); + tree.span_with(p.end_group()) +} - fn peekv(&mut self) -> Option> { - self.peek().map(Spanned::value) - } +/// Parse a parenthesized expression: `(a + b)`, `(1, key="value"). +fn parenthesized(p: &mut Parser) -> Spanned { + p.start_group(Group::Paren); + let (dict, coercable) = dict_contents(p); + let expr = if coercable { + dict.0.into_iter().next().expect("dict is coercable").expr.v + } else { + Expr::Lit(Lit::Dict(dict)) + }; + expr.span_with(p.end_group()) +} - fn assert(&mut self, token: Token<'_>) { - assert!(self.check_eat(token).is_some()); - } +/// Parse a parenthesized function call. 
+fn paren_call(p: &mut Parser, name: Spanned) -> Spanned { + p.start_group(Group::Paren); + let args = dict_contents(p).0; + let span = name.span.join(p.end_group()); + ExprCall { name, args }.span_with(span) +} - fn check_eat(&mut self, token: Token<'_>) -> Option>> { - if self.check(token) { self.eat() } else { None } - } +/// Parse the contents of a dictionary. +fn dict_contents(p: &mut Parser) -> (LitDict, bool) { + let mut dict = LitDict::new(); + let mut comma_and_keyless = true; - /// Checks if the next token is of some kind - fn check(&mut self, token: Token<'_>) -> bool { - self.peekv() == Some(token) - } + loop { + p.skip_white(); + if p.eof() { + break; + } - fn with_span(&mut self, v: T) -> Spanned { - let span = self.eat().expect("expected token").span; - v.span_with(span) - } + let entry = if let Some(entry) = dict_entry(p) { + entry + } else { + p.diag_expected("value"); + continue; + }; - fn eof(&mut self) -> bool { - self.peek().is_none() - } + if let Some(key) = &entry.key { + comma_and_keyless = false; + p.deco(Decoration::DictKey.span_with(key.span)); + } - fn eat(&mut self) -> Option>> { - let token = self.peek()?; - self.peeked = None; - Some(token) - } + let behind = entry.expr.span.end; + dict.0.push(entry); - fn peek(&mut self) -> Option>> { - let tokens = &mut self.tokens; - let token = (*self.peeked.get_or_insert_with(|| tokens.next()))?; + p.skip_white(); + if p.eof() { + break; + } - // Check for unclosed groups. 
- if Group::is_delimiter(token.v) { - if self.delimiters.iter().rev().any(|&(_, end)| token.v == end) { - return None; - } + if p.eat_if(Token::Comma).is_none() { + p.diag_expected_at("comma", behind); } - Some(token) + comma_and_keyless = false; } - fn pos(&self) -> Pos { - self.peeked - .flatten() - .map(|s| s.span.start) - .unwrap_or_else(|| self.tokens.pos()) - } + let coercable = comma_and_keyless && !dict.0.is_empty(); + (dict, coercable) } -#[derive(Debug, Copy, Clone, Eq, PartialEq)] -enum Group { - Paren, - Bracket, - Brace, - Subheader, -} +/// Parse a single entry in a dictionary. +fn dict_entry(p: &mut Parser) -> Option { + if let Some(ident) = ident(p) { + p.skip_white(); + match p.peek() { + // Key-value pair. + Some(Token::Equals) => { + p.eat_assert(Token::Equals); + p.skip_white(); + if let Some(expr) = expr(p) { + Some(LitDictEntry { + key: Some(ident.map(|id| DictKey::Str(id.0))), + expr, + }) + } else { + None + } + } -impl Group { - fn is_delimiter(token: Token<'_>) -> bool { - matches!( - token, - Token::RightParen | Token::RightBracket | Token::RightBrace | Token::Chain - ) - } + // Function call. + Some(Token::LeftParen) => Some(LitDictEntry { + key: None, + expr: paren_call(p, ident).map(Expr::Call), + }), - fn start(self) -> Option> { - match self { - Self::Paren => Some(Token::LeftParen), - Self::Bracket => Some(Token::LeftBracket), - Self::Brace => Some(Token::LeftBrace), - Self::Subheader => None, + // Just an identifier. + _ => Some(LitDictEntry { + key: None, + expr: ident.map(|id| Expr::Lit(Lit::Ident(id))), + }), } + } else if let Some(expr) = expr(p) { + Some(LitDictEntry { key: None, expr }) + } else { + None } +} - fn end(self) -> Token<'static> { - match self { - Self::Paren => Token::RightParen, - Self::Bracket => Token::RightBracket, - Self::Brace => Token::RightBrace, - Self::Subheader => Token::Chain, - } +/// Parse an identifier. 
+fn ident(p: &mut Parser) -> Option> { + p.eat_map(|token| match token { + Token::Ident(id) => Some(Ident(id.into())), + _ => None, + }) +} + +/// Parse a color. +fn color(p: &mut Parser, hex: &str, span: Span) -> Spanned { + RgbaColor::from_str(hex) + .unwrap_or_else(|_| { + // Heal color by assuming black. + p.diag(error!(span, "invalid color")); + RgbaColor::new_healed(0, 0, 0, 255) + }) + .span_with(span) +} + +/// Parse a string. +fn string(p: &mut Parser, token: TokenStr, span: Span) -> Spanned { + if !token.terminated { + p.diag_expected_at("quote", span.end); } + + resolve::resolve_string(token.string).span_with(span) } #[cfg(test)] diff --git a/src/parse/parser.rs b/src/parse/parser.rs new file mode 100644 index 00000000..d0735931 --- /dev/null +++ b/src/parse/parser.rs @@ -0,0 +1,292 @@ +use std::fmt::{self, Debug, Formatter}; + +use super::{Scanner, TokenMode, Tokens}; +use crate::diagnostic::Diagnostic; +use crate::syntax::{Decoration, Pos, Span, SpanWith, Spanned, Token}; +use crate::Feedback; + +/// A convenient token-based parser. +pub struct Parser<'s> { + tokens: Tokens<'s>, + modes: Vec, + groups: Vec<(Pos, Group)>, + f: Feedback, +} + +impl<'s> Parser<'s> { + /// Create a new parser for the source string. + pub fn new(src: &'s str) -> Self { + Self { + tokens: Tokens::new(src, TokenMode::Body), + modes: vec![], + groups: vec![], + f: Feedback::new(), + } + } + + /// Finish parsing and return the accumulated feedback. + pub fn finish(self) -> Feedback { + self.f + } + + /// Add a diagnostic to the feedback. + pub fn diag(&mut self, diag: Spanned) { + self.f.diagnostics.push(diag); + } + + /// Eat the next token and add a diagnostic that it was not expected thing. 
+ pub fn diag_expected(&mut self, thing: &str) { + if let Some(found) = self.eat() { + self.diag(error!( + found.span, + "expected {}, found {}", + thing, + found.v.name(), + )); + } else { + self.diag_expected_at(thing, self.pos()); + } + } + + /// Add a diagnostic that the thing was expected at the given position. + pub fn diag_expected_at(&mut self, thing: &str, pos: Pos) { + self.diag(error!(pos, "expected {}", thing)); + } + + /// Add a diagnostic that the given token was unexpected. + pub fn diag_unexpected(&mut self, token: Spanned) { + self.diag(error!(token.span, "unexpected {}", token.v.name())); + } + + /// Add a decoration to the feedback. + pub fn deco(&mut self, deco: Spanned) { + self.f.decorations.push(deco); + } + + /// Update the token mode and push the previous mode onto a stack. + pub fn push_mode(&mut self, mode: TokenMode) { + self.modes.push(self.tokens.mode()); + self.tokens.set_mode(mode); + } + + /// Pop the topmost token mode from the stack. + /// + /// # Panics + /// This panics if there is no mode on the stack. + pub fn pop_mode(&mut self) { + self.tokens.set_mode(self.modes.pop().expect("no pushed mode")); + } + + /// Continues parsing in a group. + /// + /// When the end delimiter of the group is reached, all subsequent calls to + /// `eat()` and `peek()` return `None`. Parsing can only continue with + /// a matching call to `end_group`. + /// + /// # Panics + /// This panics if the next token does not start the given group. + pub fn start_group(&mut self, group: Group) { + let start = self.pos(); + match group { + Group::Paren => self.eat_assert(Token::LeftParen), + Group::Bracket => self.eat_assert(Token::LeftBracket), + Group::Brace => self.eat_assert(Token::LeftBrace), + Group::Subheader => {} + } + self.groups.push((start, group)); + } + + /// Ends the parsing of a group and returns the span of the whole group. + /// + /// # Panics + /// This panics if no group was started. 
+ pub fn end_group(&mut self) -> Span { + debug_assert_eq!(self.peek(), None, "unfinished group"); + + let (start, group) = self.groups.pop().expect("unstarted group"); + let end = match group { + Group::Paren => Some(Token::RightParen), + Group::Bracket => Some(Token::RightBracket), + Group::Brace => Some(Token::RightBrace), + Group::Subheader => None, + }; + + if let Some(token) = end { + let next = self.tokens.clone().next().map(|s| s.v); + if next == Some(token) { + self.tokens.next(); + } else { + self.diag(error!(self.pos(), "expected {}", token.name())); + } + } + + Span::new(start, self.pos()) + } + + /// Consume the next token. + pub fn eat(&mut self) -> Option>> { + next_group_aware(&mut self.tokens, &self.groups) + } + + /// Consume the next token if it is the given one. + pub fn eat_if(&mut self, t: Token) -> Option>> { + // Don't call eat() twice if it suceeds. + // + // TODO: Benchmark this vs. the naive version. + let before = self.pos(); + let token = self.eat()?; + if token.v == t { + Some(token) + } else { + self.jump(before); + None + } + } + + /// Consume the next token if the closure maps to `Some`. + pub fn eat_map( + &mut self, + mut f: impl FnMut(Token<'s>) -> Option, + ) -> Option> { + let before = self.pos(); + let token = self.eat()?; + if let Some(t) = f(token.v) { + Some(t.span_with(token.span)) + } else { + self.jump(before); + None + } + } + + /// Consume the next token, debug-asserting that it is the given one. + pub fn eat_assert(&mut self, t: Token) { + let next = self.eat(); + debug_assert_eq!(next.map(|s| s.v), Some(t)); + } + + /// Consume tokens while the condition is true. + /// + /// Returns how many tokens were eaten. + pub fn eat_while(&mut self, mut f: impl FnMut(Token<'s>) -> bool) -> usize { + self.eat_until(|t| !f(t)) + } + + /// Consume tokens until the condition is true. + /// + /// Returns how many tokens were eaten. 
+ pub fn eat_until(&mut self, mut f: impl FnMut(Token<'s>) -> bool) -> usize { + let mut count = 0; + let mut before = self.pos(); + while let Some(t) = self.eat() { + if f(t.v) { + // Undo the last eat by jumping. This prevents + // double-tokenization by not peeking all the time. + // + // TODO: Benchmark this vs. the naive peeking version. + self.jump(before); + break; + } + before = self.pos(); + count += 1; + } + count + } + + /// Peek at the next token without consuming it. + pub fn peek(&self) -> Option> { + next_group_aware(&mut self.tokens.clone(), &self.groups).map(|s| s.v) + } + + /// Checks whether the next token fulfills a condition. + /// + /// Returns `false` if there is no next token. + pub fn check(&self, f: impl FnMut(Token<'s>) -> bool) -> bool { + self.peek().map(f).unwrap_or(false) + } + + /// Whether the there is no next token. + pub fn eof(&self) -> bool { + self.peek().is_none() + } + + /// Skip whitespace tokens. + pub fn skip_white(&mut self) { + self.eat_while(|t| { + matches!(t, + Token::Space(_) | + Token::LineComment(_) | + Token::BlockComment(_)) + }); + } + + /// The position in the string at which the last token ends and next token + /// will start. + pub fn pos(&self) -> Pos { + self.tokens.pos() + } + + /// Jump to a position in the source string. + pub fn jump(&mut self, pos: Pos) { + self.tokens.jump(pos); + } + + /// The full source string. + pub fn src(&self) -> &'s str { + self.scanner().src() + } + + /// The part of the source string that is spanned by the given span. + pub fn get(&self, span: Span) -> &'s str { + self.scanner().get(span.start.to_usize() .. span.end.to_usize()) + } + + /// The underlying scanner. + pub fn scanner(&self) -> &Scanner<'s> { + self.tokens.scanner() + } +} + +/// Wraps `tokens.next()`, but is group-aware. 
+///
+/// A token that would end one of the currently open `groups` is not
+/// yielded: the token iterator is rewound to the position in front of it and
+/// `None` is returned instead, exactly as if the source ended there.
+fn next_group_aware<'s>(
+    tokens: &mut Tokens<'s>,
+    groups: &[(Pos, Group)],
+) -> Option<Spanned<Token<'s>>> {
+    // Remember where the token starts so that it can be un-eaten.
+    let pos = tokens.pos();
+    let token = tokens.next();
+
+    // Only closing delimiters and `>>` can end a group; everything else
+    // (and the real end of the source) is passed through unchanged.
+    let group = match token?.v {
+        Token::RightParen => Group::Paren,
+        Token::RightBracket => Group::Bracket,
+        Token::RightBrace => Group::Brace,
+        Token::Chain => Group::Subheader,
+        _ => return token,
+    };
+
+    // Any open group of the matching kind hides the token, not just the
+    // innermost one; the search merely starts at the innermost group.
+    if groups.iter().rev().any(|&(_, g)| g == group) {
+        tokens.jump(pos);
+        None
+    } else {
+        token
+    }
+}
+
+impl Debug for Parser<'_> {
+    /// Formats the parser as `Parser(<eaten>|<rest>)`, i.e. the source text
+    /// before and after the scanner's current position.
+    fn fmt(&self, f: &mut Formatter) -> fmt::Result {
+        let s = self.scanner();
+        write!(f, "Parser({}|{})", s.eaten(), s.rest())
+    }
+}
+
+/// A group, confined by optional start and end delimiters.
+#[derive(Debug, Copy, Clone, Eq, PartialEq)]
+pub enum Group {
+    /// A parenthesized group: `(...)`.
+    Paren,
+    /// A bracketed group: `[...]`.
+    Bracket,
+    /// A curly-braced group: `{...}`.
+    Brace,
+    /// A group ended by a chained subheader or a closing bracket:
+    /// `... >>`, `...]`.
+    Subheader,
+}
diff --git a/src/parse/resolve.rs b/src/parse/resolve.rs index f9919373..6036a74e 100644 --- a/src/parse/resolve.rs +++ b/src/parse/resolve.rs @@ -3,7 +3,7 @@ use super::{is_newline, Scanner}; use crate::syntax::{Ident, NodeRaw}; -/// Resolves all escape sequences in a string. +/// Resolve all escape sequences in a string. pub fn resolve_string(string: &str) -> String { let mut out = String::with_capacity(string.len()); let mut s = Scanner::new(string); @@ -48,10 +48,10 @@ pub fn resolve_hex(sequence: &str) -> Option { u32::from_str_radix(sequence, 16).ok().and_then(std::char::from_u32) } -/// Resolves the language tag and trims the raw text. -pub fn resolve_raw(raw: &str, backticks: usize) -> NodeRaw { +/// Resolve the language tag and trims the raw text.
+pub fn resolve_raw(text: &str, backticks: usize) -> NodeRaw { if backticks > 1 { - let (tag, inner) = split_at_lang_tag(raw); + let (tag, inner) = split_at_lang_tag(text); let (lines, had_newline) = trim_and_split_raw(inner); NodeRaw { lang: Ident::new(tag), @@ -61,7 +61,7 @@ pub fn resolve_raw(raw: &str, backticks: usize) -> NodeRaw { } else { NodeRaw { lang: None, - lines: split_lines(raw), + lines: split_lines(text), inline: true, } } @@ -76,7 +76,7 @@ fn split_at_lang_tag(raw: &str) -> (&str, &str) { ) } -/// Trims raw text and splits it into lines. +/// Trim raw text and splits it into lines. /// /// Returns whether at least one newline was contained in `raw`. fn trim_and_split_raw(raw: &str) -> (Vec, bool) { @@ -101,7 +101,7 @@ fn trim_and_split_raw(raw: &str) -> (Vec, bool) { (lines, had_newline) } -/// Splits a string into a vector of lines +/// Split a string into a vector of lines /// (respecting Unicode, Unix, Mac and Windows line breaks). pub fn split_lines(text: &str) -> Vec { let mut s = Scanner::new(text); @@ -147,8 +147,8 @@ mod tests { #[test] fn test_split_at_lang_tag() { - fn test(raw: &str, lang: &str, inner: &str) { - assert_eq!(split_at_lang_tag(raw), (lang, inner)); + fn test(text: &str, lang: &str, inner: &str) { + assert_eq!(split_at_lang_tag(text), (lang, inner)); } test("typst it!", "typst", " it!"); @@ -161,8 +161,8 @@ mod tests { #[test] fn test_trim_raw() { - fn test(raw: &str, expected: Vec<&str>) { - assert_eq!(trim_and_split_raw(raw).0, expected); + fn test(text: &str, expected: Vec<&str>) { + assert_eq!(trim_and_split_raw(text).0, expected); } test(" hi", vec!["hi"]); @@ -178,8 +178,8 @@ mod tests { #[test] fn test_split_lines() { - fn test(raw: &str, expected: Vec<&str>) { - assert_eq!(split_lines(raw), expected); + fn test(text: &str, expected: Vec<&str>) { + assert_eq!(split_lines(text), expected); } test("raw\ntext", vec!["raw", "text"]); diff --git a/src/parse/scanner.rs b/src/parse/scanner.rs index 9447222d..6ff8c801 100644 
--- a/src/parse/scanner.rs +++ b/src/parse/scanner.rs @@ -4,7 +4,8 @@ use std::fmt::{self, Debug, Formatter}; use std::slice::SliceIndex; use std::str::Chars; -/// A low-level featureful char scanner. +/// A low-level featureful char-based scanner. +#[derive(Clone)] pub struct Scanner<'s> { src: &'s str, iter: Chars<'s>, @@ -98,24 +99,22 @@ impl<'s> Scanner<'s> { /// Checks whether the next character fulfills a condition. /// - /// Returns `false` is there is no next character. + /// Returns `false` if there is no next character. pub fn check(&self, f: impl FnMut(char) -> bool) -> bool { self.peek().map(f).unwrap_or(false) } - /// Go back to the where the index says. - fn reset(&mut self) { - self.iter = self.src[self.index ..].chars(); + /// Whether the end of the source string is reached. + pub fn eof(&self) -> bool { + self.iter.as_str().is_empty() } -} -impl<'s> Scanner<'s> { - /// The current index in the string. + /// The current index in the source string. pub fn index(&self) -> usize { self.index } - /// The previous index in the string. + /// The previous index in the source string. pub fn prev_index(&self) -> usize { self.src[.. self.index] .chars() @@ -124,6 +123,17 @@ impl<'s> Scanner<'s> { .unwrap_or(0) } + /// Jump to an index in the source string. + pub fn jump(&mut self, index: usize) { + self.index = index; + self.reset(); + } + + /// The full source string. + pub fn src(&self) -> &'s str { + self.src + } + /// Slice a part out of the source string. pub fn get(&self, index: I) -> &'s str where @@ -132,11 +142,6 @@ impl<'s> Scanner<'s> { &self.src[index] } - /// The full source string. - pub fn src(&self) -> &'s str { - self.src - } - /// The full source string up to the current index. pub fn eaten(&self) -> &'s str { &self.src[.. self.index] @@ -151,6 +156,11 @@ impl<'s> Scanner<'s> { pub fn rest(&self) -> &'s str { &self.src[self.index ..] } + + /// Go back to the where the index says. 
+ fn reset(&mut self) { + self.iter = self.src[self.index ..].chars(); + } } impl Debug for Scanner<'_> { diff --git a/src/parse/tests.rs b/src/parse/tests.rs index 9d6b673f..a1b1fb13 100644 --- a/src/parse/tests.rs +++ b/src/parse/tests.rs @@ -14,7 +14,7 @@ use crate::syntax::*; use Decoration::*; use SynNode::{ - Linebreak as L, Parbreak as P, Spacing as S, ToggleBolder as B, ToggleItalic as I, + Linebreak as L, Parbreak as P, Space as S, ToggleBolder as B, ToggleItalic as I, }; fn T(text: &str) -> SynNode { @@ -80,21 +80,21 @@ fn Str(string: &str) -> Expr { macro_rules! Dict { (@dict=$dict:expr,) => {}; - (@dict=$dict:expr, $key:expr => $value:expr $(, $($tts:tt)*)?) => {{ + (@dict=$dict:expr, $key:expr => $expr:expr $(, $($tts:tt)*)?) => {{ let key = Into::>::into($key); let key = key.map(Into::::into); - let value = Into::>::into($value); - $dict.0.push(LitDictEntry { key: Some(key), value }); + let expr = Into::>::into($expr); + $dict.0.push(LitDictEntry { key: Some(key), expr }); Dict![@dict=$dict, $($($tts)*)?]; }}; - (@dict=$dict:expr, $value:expr $(, $($tts:tt)*)?) => { - let value = Into::>::into($value); - $dict.0.push(LitDictEntry { key: None, value }); + (@dict=$dict:expr, $expr:expr $(, $($tts:tt)*)?) 
=> { + let expr = Into::>::into($expr); + $dict.0.push(LitDictEntry { key: None, expr }); Dict![@dict=$dict, $($($tts)*)?]; }; (@$($tts:tt)*) => {{ #[allow(unused_mut)] - let mut dict = LitDict::default(); + let mut dict = LitDict::new(); Dict![@dict=dict, $($tts)*]; dict }}; @@ -344,7 +344,6 @@ fn test_parse_function_names() { fn test_parse_chaining() { // Things the parser has to make sense of t!("[hi: (5.0, 2.1 >> you]" => F!("hi"; Dict![Float(5.0), Float(2.1)], Tree![F!("you")])); - t!("[box >>][Hi]" => F!("box"; Tree![T("Hi")])); t!("[box >> pad: 1pt][Hi]" => F!("box"; Tree![ F!("pad"; Len(Length::pt(1.0)), Tree!(T("Hi"))) ])); @@ -354,7 +353,8 @@ fn test_parse_chaining() { // Errors for unclosed / empty predecessor groups e!("[hi: (5.0, 2.1 >> you]" => s(15, 15, "expected closing paren")); - e!("[>> abc]" => s(1, 1, "expected function name")); + e!("[>> abc]" => s(1, 1, "expected function name")); + e!("[box >>][Hi]" => s(7, 7, "expected function name")); } #[test] @@ -482,7 +482,7 @@ fn test_parse_expressions() { // Invalid expressions. v!("4pt--" => Len(Length::pt(4.0))); - e!("[val: 4pt--]" => s(10, 11, "dangling minus"), + e!("[val: 4pt--]" => s(10, 11, "missing factor"), s(6, 10, "missing right summand")); v!("3mm+4pt*" => Binary(Add, Len(Length::mm(3.0)), Len(Length::pt(4.0)))); diff --git a/src/parse/tokens.rs b/src/parse/tokens.rs index 9f30f587..72d7b2d9 100644 --- a/src/parse/tokens.rs +++ b/src/parse/tokens.rs @@ -1,17 +1,19 @@ //! Tokenization. +use std::fmt::{self, Debug, Formatter}; + use super::{is_newline, Scanner}; use crate::length::Length; -use crate::syntax::{is_ident, Pos, Span, SpanWith, Spanned, Token}; +use crate::syntax::token::*; +use crate::syntax::{is_ident, Pos, Span, SpanWith, Spanned}; use TokenMode::*; /// An iterator over the tokens of a string of source code. 
-#[derive(Debug)] +#[derive(Clone)] pub struct Tokens<'s> { s: Scanner<'s>, mode: TokenMode, - stack: Vec, } /// Whether to tokenize in header mode which yields expression, comma and @@ -26,23 +28,17 @@ pub enum TokenMode { impl<'s> Tokens<'s> { /// Create a new token iterator with the given mode. pub fn new(src: &'s str, mode: TokenMode) -> Self { - Self { - s: Scanner::new(src), - mode, - stack: vec![], - } + Self { s: Scanner::new(src), mode } } - /// Change the token mode and push the old one on a stack. - pub fn push_mode(&mut self, mode: TokenMode) { - self.stack.push(self.mode); - self.mode = mode; + /// Get the current token mode. + pub fn mode(&self) -> TokenMode { + self.mode } - /// Pop the old token mode from the stack. This panics if there is no mode - /// on the stack. - pub fn pop_mode(&mut self) { - self.mode = self.stack.pop().expect("no pushed mode"); + /// Change the token mode. + pub fn set_mode(&mut self, mode: TokenMode) { + self.mode = mode; } /// The position in the string at which the last token ends and next token @@ -50,6 +46,16 @@ impl<'s> Tokens<'s> { pub fn pos(&self) -> Pos { self.s.index().into() } + + /// Jump to a position in the source string. + pub fn jump(&mut self, pos: Pos) { + self.s.jump(pos.to_usize()); + } + + /// The underlying scanner. + pub fn scanner(&self) -> &Scanner<'s> { + &self.s + } } impl<'s> Iterator for Tokens<'s> { @@ -59,8 +65,12 @@ impl<'s> Iterator for Tokens<'s> { fn next(&mut self) -> Option { let start = self.s.index(); let token = match self.s.eat()? { - // Whitespace. - c if c.is_whitespace() => self.read_whitespace(c), + // Whitespace with fast path for just a single space. + ' ' if !self.s.check(|c| c.is_whitespace()) => Token::Space(0), + c if c.is_whitespace() => { + self.s.jump(start); + self.read_whitespace() + } // Comments. '/' if self.s.eat_if('/') => self.read_line_comment(), @@ -76,8 +86,8 @@ impl<'s> Iterator for Tokens<'s> { // Syntactic elements in body text. 
'*' if self.mode == Body => Token::Star, '_' if self.mode == Body => Token::Underscore, - '`' if self.mode == Body => self.read_raw(), '#' if self.mode == Body => Token::Hashtag, + '`' if self.mode == Body => self.read_raw(), '~' if self.mode == Body => Token::Text("\u{00A0}"), '\\' if self.mode == Body => self.read_escaped(), @@ -88,12 +98,12 @@ impl<'s> Iterator for Tokens<'s> { ',' if self.mode == Header => Token::Comma, '=' if self.mode == Header => Token::Equals, '>' if self.mode == Header && self.s.eat_if('>') => Token::Chain, - - // Expressions in headers. '+' if self.mode == Header => Token::Plus, '-' if self.mode == Header => Token::Hyphen, '*' if self.mode == Header => Token::Star, '/' if self.mode == Header => Token::Slash, + + // Expressions in headers. '#' if self.mode == Header => self.read_hex(), '"' if self.mode == Header => self.read_string(), @@ -107,18 +117,7 @@ impl<'s> Iterator for Tokens<'s> { } impl<'s> Tokens<'s> { - fn read_whitespace(&mut self, first: char) -> Token<'s> { - // Shortcut for common case of exactly one space. - if first == ' ' && !self.s.check(|c| c.is_whitespace()) { - return Token::Space(0); - } - - // Uneat the first char if it's a newline, so that it's counted in the - // loop. - if is_newline(first) { - self.s.uneat(); - } - + fn read_whitespace(&mut self) -> Token<'s> { // Count the number of newlines. let mut newlines = 0; while let Some(c) = self.s.eat_merging_crlf() { @@ -169,27 +168,6 @@ impl<'s> Tokens<'s> { Token::BlockComment(self.s.get(start .. end)) } - fn read_hex(&mut self) -> Token<'s> { - // This parses more than the permissable 0-9, a-f, A-F character ranges - // to provide nicer error messages later. 
- Token::Hex(self.s.eat_while(|c| c.is_ascii_alphanumeric())) - } - - fn read_string(&mut self) -> Token<'s> { - let mut escaped = false; - Token::Str { - string: self.s.eat_until(|c| { - if c == '"' && !escaped { - true - } else { - escaped = c == '\\' && !escaped; - false - } - }), - terminated: self.s.eat_if('"'), - } - } - fn read_raw(&mut self) -> Token<'s> { let mut backticks = 1; while self.s.eat_if('`') { @@ -210,11 +188,11 @@ impl<'s> Tokens<'s> { let terminated = found == backticks; let end = self.s.index() - if terminated { found } else { 0 }; - Token::Raw { - raw: self.s.get(start .. end), + Token::Raw(TokenRaw { + text: self.s.get(start .. end), backticks, terminated, - } + }) } fn read_escaped(&mut self) -> Token<'s> { @@ -228,10 +206,10 @@ impl<'s> Tokens<'s> { 'u' if self.s.peek_nth(1) == Some('{') => { self.s.eat_assert('u'); self.s.eat_assert('{'); - Token::UnicodeEscape { + Token::UnicodeEscape(TokenUnicodeEscape { sequence: self.s.eat_while(|c| c.is_ascii_hexdigit()), terminated: self.s.eat_if('}'), - } + }) } c if c.is_whitespace() => Token::Backslash, _ => Token::Text("\\"), @@ -241,6 +219,27 @@ impl<'s> Tokens<'s> { } } + fn read_hex(&mut self) -> Token<'s> { + // This parses more than the permissable 0-9, a-f, A-F character ranges + // to provide nicer error messages later. 
+ Token::Hex(self.s.eat_while(|c| c.is_ascii_alphanumeric())) + } + + fn read_string(&mut self) -> Token<'s> { + let mut escaped = false; + Token::Str(TokenStr { + string: self.s.eat_until(|c| { + if c == '"' && !escaped { + true + } else { + escaped = c == '\\' && !escaped; + false + } + }), + terminated: self.s.eat_if('"'), + }) + } + fn read_text_or_expr(&mut self, start: usize) -> Token<'s> { let body = self.mode == Body; let header = self.mode == Header; @@ -268,6 +267,12 @@ impl<'s> Tokens<'s> { } } +impl Debug for Tokens<'_> { + fn fmt(&self, f: &mut Formatter) -> fmt::Result { + write!(f, "Tokens({}|{})", self.s.eaten(), self.s.rest()) + } +} + fn parse_expr(text: &str) -> Token<'_> { if let Ok(b) = text.parse::() { Token::Bool(b) @@ -303,13 +308,13 @@ mod tests { }; fn Str(string: &str, terminated: bool) -> Token { - Token::Str { string, terminated } + Token::Str(TokenStr { string, terminated }) } - fn Raw(raw: &str, backticks: usize, terminated: bool) -> Token { - Token::Raw { raw, backticks, terminated } + fn Raw(text: &str, backticks: usize, terminated: bool) -> Token { + Token::Raw(TokenRaw { text, backticks, terminated }) } fn UE(sequence: &str, terminated: bool) -> Token { - Token::UnicodeEscape { sequence, terminated } + Token::UnicodeEscape(TokenUnicodeEscape { sequence, terminated }) } macro_rules! 
t { ($($tts:tt)*) => {test!(@spans=false, $($tts)*)} } @@ -388,37 +393,66 @@ mod tests { t!(Body, "````\n```js\nalert()\n```\n````" => Raw("\n```js\nalert()\n```\n", 4, true)); } + #[test] + fn tokenize_escaped_symbols() { + t!(Body, r"\\" => T(r"\")); + t!(Body, r"\[" => T("[")); + t!(Body, r"\]" => T("]")); + t!(Body, r"\*" => T("*")); + t!(Body, r"\_" => T("_")); + t!(Body, r"\`" => T("`")); + t!(Body, r"\/" => T("/")); + t!(Body, r"\u{2603}" => UE("2603", true)); + t!(Body, r"\u{26A4" => UE("26A4", false)); + t!(Body, r#"\""# => T("\"")); + } + + #[test] + fn tokenize_unescapable_symbols() { + t!(Body, r"\a" => T("\\"), T("a")); + t!(Body, r"\:" => T(r"\"), T(":")); + t!(Body, r"\=" => T(r"\"), T("=")); + t!(Body, r"\u{2GA4" => UE("2", false), T("GA4")); + t!(Body, r"\u{ " => UE("", false), Space(0)); + t!(Body, r"\u" => T("\\"), T("u")); + t!(Header, r"\\\\" => Invalid(r"\\\\")); + t!(Header, r"\a" => Invalid(r"\a")); + t!(Header, r"\:" => Invalid(r"\"), Colon); + t!(Header, r"\=" => Invalid(r"\"), Equals); + t!(Header, r"\," => Invalid(r"\"), Comma); + } + #[test] fn tokenize_header_tokens() { - t!(Header, "__main__" => Id("__main__")); - t!(Header, "_func_box" => Id("_func_box")); - t!(Header, ">main" => Invalid(">main")); - t!(Header, "🌓, 🌍," => Invalid("🌓"), Comma, S(0), Invalid("🌍"), Comma); - t!(Header, "{abc}" => LB, Id("abc"), RB); - t!(Header, "(1,2)" => LP, Num(1.0), Comma, Num(2.0), RP); - t!(Header, "12_pt, 12pt" => Invalid("12_pt"), Comma, S(0), Len(Length::pt(12.0))); - t!(Header, "f: arg >> g" => Id("f"), Colon, S(0), Id("arg"), S(0), Chain, S(0), Id("g")); - t!(Header, "=3.14" => Equals, Num(3.14)); - t!(Header, "arg, _b, _1" => Id("arg"), Comma, S(0), Id("_b"), Comma, S(0), Id("_1")); - t!(Header, "a:b" => Id("a"), Colon, Id("b")); - t!(Header, "(){}:=," => LP, RP, LB, RB, Colon, Equals, Comma); - t!(Body, "c=d, " => T("c=d,"), S(0)); - t!(Body, "a: b" => T("a:"), S(0), T("b")); - t!(Header, "a: true, x=1" => Id("a"), Colon, S(0), Bool(true), 
Comma, S(0), - Id("x"), Equals, Num(1.0)); + t!(Header, "__main__" => Id("__main__")); + t!(Header, "_func_box" => Id("_func_box")); + t!(Header, ">main" => Invalid(">main")); + t!(Header, "🌓, 🌍," => Invalid("🌓"), Comma, S(0), Invalid("🌍"), Comma); + t!(Header, "{abc}" => LB, Id("abc"), RB); + t!(Header, "(1,2)" => LP, Num(1.0), Comma, Num(2.0), RP); + t!(Header, "12_pt, 12pt" => Invalid("12_pt"), Comma, S(0), Len(Length::pt(12.0))); + t!(Header, "f: arg >> g" => Id("f"), Colon, S(0), Id("arg"), S(0), Chain, S(0), Id("g")); + t!(Header, "=3.14" => Equals, Num(3.14)); + t!(Header, "arg, _b, _1" => Id("arg"), Comma, S(0), Id("_b"), Comma, S(0), Id("_1")); + t!(Header, "a:b" => Id("a"), Colon, Id("b")); + t!(Header, "(){}:=," => LP, RP, LB, RB, Colon, Equals, Comma); + t!(Body, "c=d, " => T("c=d,"), S(0)); + t!(Body, "a: b" => T("a:"), S(0), T("b")); + t!(Header, "a: true, x=1" => Id("a"), Colon, S(0), Bool(true), Comma, S(0), + Id("x"), Equals, Num(1.0)); } #[test] fn tokenize_numeric_values() { - t!(Header, "12.3e5" => Num(12.3e5)); - t!(Header, "120%" => Num(1.2)); - t!(Header, "12e4%" => Num(1200.0)); - t!(Header, "1e5in" => Len(Length::inches(100000.0))); - t!(Header, "2.3cm" => Len(Length::cm(2.3))); - t!(Header, "02.4mm" => Len(Length::mm(2.4))); - t!(Header, "2.4.cm" => Invalid("2.4.cm")); - t!(Header, "#6ae6dd" => Hex("6ae6dd")); - t!(Header, "#8A083c" => Hex("8A083c")); + t!(Header, "12.3e5" => Num(12.3e5)); + t!(Header, "120%" => Num(1.2)); + t!(Header, "12e4%" => Num(1200.0)); + t!(Header, "1e5in" => Len(Length::inches(100000.0))); + t!(Header, "2.3cm" => Len(Length::cm(2.3))); + t!(Header, "02.4mm" => Len(Length::mm(2.4))); + t!(Header, "2.4.cm" => Invalid("2.4.cm")); + t!(Header, "#6ae6dd" => Hex("6ae6dd")); + t!(Header, "#8A083c" => Hex("8A083c")); } #[test] @@ -446,35 +480,6 @@ mod tests { S(0), Slash, S(0), Num(2.1)); } - #[test] - fn tokenize_escaped_symbols() { - t!(Body, r"\\" => T(r"\")); - t!(Body, r"\[" => T("[")); - t!(Body, r"\]" => T("]")); - 
t!(Body, r"\*" => T("*")); - t!(Body, r"\_" => T("_")); - t!(Body, r"\`" => T("`")); - t!(Body, r"\/" => T("/")); - t!(Body, r"\u{2603}" => UE("2603", true)); - t!(Body, r"\u{26A4" => UE("26A4", false)); - t!(Body, r#"\""# => T("\"")); - } - - #[test] - fn tokenize_unescapable_symbols() { - t!(Body, r"\a" => T("\\"), T("a")); - t!(Body, r"\:" => T(r"\"), T(":")); - t!(Body, r"\=" => T(r"\"), T("=")); - t!(Body, r"\u{2GA4" => UE("2", false), T("GA4")); - t!(Body, r"\u{ " => UE("", false), Space(0)); - t!(Body, r"\u" => T("\\"), T("u")); - t!(Header, r"\\\\" => Invalid(r"\\\\")); - t!(Header, r"\a" => Invalid(r"\a")); - t!(Header, r"\:" => Invalid(r"\"), Colon); - t!(Header, r"\=" => Invalid(r"\"), Equals); - t!(Header, r"\," => Invalid(r"\"), Comma); - } - #[test] fn tokenize_with_spans() { ts!(Body, "hello" => s(0, 5, T("hello"))); -- cgit v1.2.3