diff options
| author | Laurenz <laurmaedje@gmail.com> | 2021-11-08 13:08:15 +0100 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2021-11-08 13:08:15 +0100 |
| commit | c6f8ad35f45248f1fd36ee00195966f1629c6ca7 (patch) | |
| tree | 51faa3f6bbc56f75636823adeea135ed76e1b33b /src/parse | |
| parent | ea6ee3f667e922ed2f21b08719a45d2395787932 (diff) | |
| parent | 38c5c362419c5eee7a4fdc0b43d3a9dfb339a6d2 (diff) | |
Merge pull request #46 from typst/parser-ng
Next Generation Parser
Diffstat (limited to 'src/parse')
| -rw-r--r-- | src/parse/mod.rs | 963 | ||||
| -rw-r--r-- | src/parse/parser.rs | 556 | ||||
| -rw-r--r-- | src/parse/resolve.rs | 21 | ||||
| -rw-r--r-- | src/parse/scanner.rs | 45 | ||||
| -rw-r--r-- | src/parse/tokens.rs | 618 |
5 files changed, 1141 insertions, 1062 deletions
diff --git a/src/parse/mod.rs b/src/parse/mod.rs index 30787423..f9c0049f 100644 --- a/src/parse/mod.rs +++ b/src/parse/mod.rs @@ -12,216 +12,162 @@ pub use tokens::*; use std::rc::Rc; -use crate::diag::TypResult; -use crate::source::SourceFile; -use crate::syntax::*; -use crate::util::EcoString; +use crate::syntax::ast::{Associativity, BinOp, UnOp}; +use crate::syntax::{ErrorPos, Green, GreenNode, NodeKind}; /// Parse a source file. -pub fn parse(source: &SourceFile) -> TypResult<Markup> { - let mut p = Parser::new(source); - let markup = markup(&mut p); - let errors = p.finish(); - if errors.is_empty() { - Ok(markup) - } else { - Err(Box::new(errors)) +pub fn parse(src: &str) -> Rc<GreenNode> { + let mut p = Parser::new(src); + markup(&mut p); + match p.finish().into_iter().next() { + Some(Green::Node(node)) => node, + _ => unreachable!(), } } /// Parse markup. -fn markup(p: &mut Parser) -> Markup { +fn markup(p: &mut Parser) { markup_while(p, true, &mut |_| true) } -/// Parse markup that stays equal or right of the given column. -fn markup_indented(p: &mut Parser, column: usize) -> Markup { +/// Parse markup that stays right of the given column. +fn markup_indented(p: &mut Parser, column: usize) { p.eat_while(|t| match t { - Token::Space(n) => n == 0, - Token::LineComment(_) | Token::BlockComment(_) => true, + NodeKind::Space(n) => *n == 0, + NodeKind::LineComment | NodeKind::BlockComment => true, _ => false, }); markup_while(p, false, &mut |p| match p.peek() { - Some(Token::Space(n)) if n >= 1 => p.column(p.next_end()) >= column, + Some(NodeKind::Space(n)) if *n >= 1 => p.column(p.current_end()) >= column, _ => true, }) } -/// Parse a syntax tree while the peeked token satisifies a condition. +/// Parse a syntax tree while the peeked NodeKind satisifies a condition. /// /// If `at_start` is true, things like headings that may only appear at the /// beginning of a line or template are allowed. -fn markup_while<F>(p: &mut Parser, mut at_start: bool, f: &mut F) -> Markup +fn markup_while<F>(p: &mut Parser, mut at_start: bool, f: &mut F) where F: FnMut(&mut Parser) -> bool, { - let mut tree = vec![]; - while !p.eof() && f(p) { - if let Some(node) = markup_node(p, &mut at_start) { - at_start &= matches!(node, MarkupNode::Space | MarkupNode::Parbreak(_)); - tree.push(node); + p.perform(NodeKind::Markup, |p| { + while !p.eof() && f(p) { + markup_node(p, &mut at_start); } - } - - tree + }); } /// Parse a markup node. -fn markup_node(p: &mut Parser, at_start: &mut bool) -> Option<MarkupNode> { - let token = p.peek()?; - let span = p.peek_span(); - let node = match token { +fn markup_node(p: &mut Parser, at_start: &mut bool) { + let token = match p.peek() { + Some(t) => t, + None => return, + }; + + match token { // Whitespace. - Token::Space(newlines) => { - *at_start |= newlines > 0; - if newlines < 2 { - MarkupNode::Space + NodeKind::Space(newlines) => { + *at_start |= *newlines > 0; + if *newlines < 2 { + p.eat(); } else { - MarkupNode::Parbreak(span) + p.convert(NodeKind::Parbreak); } + return; } - // Text. - Token::Text(text) => MarkupNode::Text(text.into()), - Token::Tilde => MarkupNode::Text("\u{00A0}".into()), - Token::HyphHyph => MarkupNode::Text("\u{2013}".into()), - Token::HyphHyphHyph => MarkupNode::Text("\u{2014}".into()), - Token::UnicodeEscape(t) => MarkupNode::Text(unicode_escape(p, t)), + // Comments. + NodeKind::LineComment | NodeKind::BlockComment => { + p.eat(); + return; + } + + // Text and markup. + NodeKind::Text(_) + | NodeKind::EnDash + | NodeKind::EmDash + | NodeKind::NonBreakingSpace + | NodeKind::Emph + | NodeKind::Strong + | NodeKind::Linebreak + | NodeKind::Raw(_) + | NodeKind::Math(_) + | NodeKind::UnicodeEscape(_) => { + p.eat(); + } - // Markup. - Token::Backslash => MarkupNode::Linebreak(span), - Token::Star => MarkupNode::Strong(span), - Token::Underscore => MarkupNode::Emph(span), - Token::Raw(t) => raw(p, t), - Token::Eq if *at_start => return Some(heading(p)), - Token::Hyph if *at_start => return Some(list_node(p)), - Token::Numbering(number) if *at_start => return Some(enum_node(p, number)), + NodeKind::Eq if *at_start => heading(p), + NodeKind::Minus if *at_start => list_node(p), + NodeKind::EnumNumbering(_) if *at_start => enum_node(p), // Line-based markup that is not currently at the start of the line. - Token::Eq | Token::Hyph | Token::Numbering(_) => { - MarkupNode::Text(p.peek_src().into()) + NodeKind::Eq | NodeKind::Minus | NodeKind::EnumNumbering(_) => { + p.convert(NodeKind::Text(p.peek_src().into())); } // Hashtag + keyword / identifier. - Token::Ident(_) - | Token::Let - | Token::If - | Token::While - | Token::For - | Token::Import - | Token::Include => { - let stmt = matches!(token, Token::Let | Token::Import); + NodeKind::Ident(_) + | NodeKind::Let + | NodeKind::If + | NodeKind::While + | NodeKind::For + | NodeKind::Import + | NodeKind::Include => { + let stmt = matches!(token, NodeKind::Let | NodeKind::Import); let group = if stmt { Group::Stmt } else { Group::Expr }; - p.start_group(group, TokenMode::Code); - let expr = expr_with(p, true, 0); - if stmt && expr.is_some() && !p.eof() { - p.expected_at(p.prev_end(), "semicolon or line break"); + p.start_group(group); + let res = expr_prec(p, true, 0); + if stmt && res.is_ok() && !p.eof() { + p.expected_at("semicolon or line break"); } p.end_group(); - - return expr.map(MarkupNode::Expr); } // Block and template. - Token::LeftBrace => return Some(MarkupNode::Expr(block(p))), - Token::LeftBracket => return Some(MarkupNode::Expr(template(p))), - - // Comments. - Token::LineComment(_) | Token::BlockComment(_) => { - p.eat(); - return None; - } + NodeKind::LeftBrace => block(p), + NodeKind::LeftBracket => template(p), - _ => { - *at_start = false; - p.unexpected(); - return None; - } + NodeKind::Error(_, _) => p.eat(), + _ => p.unexpected(), }; - p.eat(); - Some(node) -} -/// Handle a unicode escape sequence. -fn unicode_escape(p: &mut Parser, token: UnicodeEscapeToken) -> EcoString { - let span = p.peek_span(); - let text = if let Some(c) = resolve::resolve_hex(token.sequence) { - c.into() - } else { - // Print out the escape sequence verbatim if it is invalid. - p.error(span, "invalid unicode escape sequence"); - p.peek_src().into() - }; - - if !token.terminated { - p.error(span.end, "expected closing brace"); - } - - text -} - -/// Handle a raw block. -fn raw(p: &mut Parser, token: RawToken) -> MarkupNode { - let column = p.column(p.next_start()); - let span = p.peek_span(); - let raw = resolve::resolve_raw(span, column, token.backticks, token.text); - if !token.terminated { - p.error(span.end, "expected backtick(s)"); - } - MarkupNode::Raw(Box::new(raw)) + *at_start = false; } /// Parse a heading. -fn heading(p: &mut Parser) -> MarkupNode { - let start = p.next_start(); - p.eat_assert(Token::Eq); - - // Count depth. - let mut level: usize = 1; - while p.eat_if(Token::Eq) { - level += 1; - } - - if level > 6 { - return MarkupNode::Text(p.get(start .. p.prev_end()).into()); - } - - let column = p.column(p.prev_end()); - let body = markup_indented(p, column); - MarkupNode::Heading(Box::new(HeadingNode { - span: p.span_from(start), - level, - body, - })) +fn heading(p: &mut Parser) { + p.perform(NodeKind::Heading, |p| { + p.eat_assert(&NodeKind::Eq); + while p.eat_if(&NodeKind::Eq) {} + let column = p.column(p.prev_end()); + markup_indented(p, column); + }); } /// Parse a single list item. -fn list_node(p: &mut Parser) -> MarkupNode { - let start = p.next_start(); - p.eat_assert(Token::Hyph); - let column = p.column(p.prev_end()); - let body = markup_indented(p, column); - MarkupNode::List(Box::new(ListNode { span: p.span_from(start), body })) +fn list_node(p: &mut Parser) { + p.perform(NodeKind::List, |p| { + p.eat_assert(&NodeKind::Minus); + let column = p.column(p.prev_end()); + markup_indented(p, column); + }); } /// Parse a single enum item. -fn enum_node(p: &mut Parser, number: Option<usize>) -> MarkupNode { - let start = p.next_start(); - p.eat_assert(Token::Numbering(number)); - let column = p.column(p.prev_end()); - let body = markup_indented(p, column); - MarkupNode::Enum(Box::new(EnumNode { - span: p.span_from(start), - number, - body, - })) +fn enum_node(p: &mut Parser) { + p.perform(NodeKind::Enum, |p| { + p.eat(); + let column = p.column(p.prev_end()); + markup_indented(p, column); + }); } /// Parse an expression. -fn expr(p: &mut Parser) -> Option<Expr> { - expr_with(p, false, 0) +fn expr(p: &mut Parser) -> ParseResult { + expr_prec(p, false, 0) } /// Parse an expression with operators having at least the minimum precedence. @@ -231,13 +177,16 @@ fn expr(p: &mut Parser) -> Option<Expr> { /// in markup. /// /// Stops parsing at operations with lower precedence than `min_prec`, -fn expr_with(p: &mut Parser, atomic: bool, min_prec: usize) -> Option<Expr> { - let start = p.next_start(); - let mut lhs = match p.eat_map(UnOp::from_token) { +fn expr_prec(p: &mut Parser, atomic: bool, min_prec: usize) -> ParseResult { + let marker = p.marker(); + + // Start the unary expression. + match p.peek().and_then(UnOp::from_token) { Some(op) => { + p.eat(); let prec = op.precedence(); - let expr = expr_with(p, atomic, prec)?; - Expr::Unary(Box::new(UnaryExpr { span: p.span_from(start), op, expr })) + expr_prec(p, atomic, prec)?; + marker.end(p, NodeKind::Unary); } None => primary(p, atomic)?, }; @@ -245,19 +194,19 @@ fn expr_with(p: &mut Parser, atomic: bool, min_prec: usize) -> Option<Expr> { loop { // Exclamation mark, parenthesis or bracket means this is a function // call. - if matches!(p.peek_direct(), Some(Token::LeftParen | Token::LeftBracket)) { - lhs = call(p, lhs)?; + if let Some(NodeKind::LeftParen | NodeKind::LeftBracket) = p.peek_direct() { + call(p, marker)?; continue; } - if p.eat_if(Token::With) { - lhs = with_expr(p, lhs)?; - } - if atomic { break; } + if p.at(&NodeKind::With) { + with_expr(p, marker)?; + } + let op = match p.peek().and_then(BinOp::from_token) { Some(binop) => binop, None => break, @@ -269,96 +218,94 @@ fn expr_with(p: &mut Parser, atomic: bool, min_prec: usize) -> Option<Expr> { } p.eat(); + match op.associativity() { Associativity::Left => prec += 1, Associativity::Right => {} } - let rhs = match expr_with(p, atomic, prec) { - Some(rhs) => rhs, - None => break, - }; - - let span = lhs.span().join(rhs.span()); - lhs = Expr::Binary(Box::new(BinaryExpr { span, lhs, op, rhs })); + marker.perform(p, NodeKind::Binary, |p| expr_prec(p, atomic, prec))?; } - Some(lhs) + Ok(()) } /// Parse a primary expression. -fn primary(p: &mut Parser, atomic: bool) -> Option<Expr> { - if let Some(expr) = literal(p) { - return Some(expr); +fn primary(p: &mut Parser, atomic: bool) -> ParseResult { + if literal(p) { + return Ok(()); } match p.peek() { // Things that start with an identifier. - Some(Token::Ident(string)) => { - let ident = Ident { - span: p.eat_span(), - string: string.into(), - }; + Some(NodeKind::Ident(_)) => { + let marker = p.marker(); + p.eat(); // Arrow means this is a closure's lone parameter. - Some(if !atomic && p.eat_if(Token::Arrow) { - let body = expr(p)?; - Expr::Closure(Box::new(ClosureExpr { - span: ident.span.join(body.span()), - name: None, - params: vec![ClosureParam::Pos(ident)], - body: Rc::new(body), - })) + if !atomic && p.at(&NodeKind::Arrow) { + marker.end(p, NodeKind::ClosureParams); + p.eat_assert(&NodeKind::Arrow); + marker.perform(p, NodeKind::Closure, expr) } else { - Expr::Ident(Box::new(ident)) - }) + Ok(()) + } } // Structures. - Some(Token::LeftParen) => parenthesized(p), - Some(Token::LeftBracket) => Some(template(p)), - Some(Token::LeftBrace) => Some(block(p)), + Some(NodeKind::LeftParen) => parenthesized(p), + Some(NodeKind::LeftBracket) => { + template(p); + Ok(()) + } + Some(NodeKind::LeftBrace) => { + block(p); + Ok(()) + } // Keywords. - Some(Token::Let) => let_expr(p), - Some(Token::If) => if_expr(p), - Some(Token::While) => while_expr(p), - Some(Token::For) => for_expr(p), - Some(Token::Import) => import_expr(p), - Some(Token::Include) => include_expr(p), + Some(NodeKind::Let) => let_expr(p), + Some(NodeKind::If) => if_expr(p), + Some(NodeKind::While) => while_expr(p), + Some(NodeKind::For) => for_expr(p), + Some(NodeKind::Import) => import_expr(p), + Some(NodeKind::Include) => include_expr(p), + + Some(NodeKind::Error(_, _)) => { + p.eat(); + Err(()) + } // Nothing. _ => { p.expected("expression"); - None + Err(()) } } } /// Parse a literal. -fn literal(p: &mut Parser) -> Option<Expr> { - let span = p.peek_span(); - let lit = match p.peek()? { +fn literal(p: &mut Parser) -> bool { + match p.peek() { // Basic values. - Token::None => Lit::None(span), - Token::Auto => Lit::Auto(span), - Token::Bool(b) => Lit::Bool(span, b), - Token::Int(i) => Lit::Int(span, i), - Token::Float(f) => Lit::Float(span, f), - Token::Length(val, unit) => Lit::Length(span, val, unit), - Token::Angle(val, unit) => Lit::Angle(span, val, unit), - Token::Percent(p) => Lit::Percent(span, p), - Token::Fraction(p) => Lit::Fractional(span, p), - Token::Str(token) => Lit::Str(span, { - if !token.terminated { - p.expected_at(span.end, "quote"); - } - resolve::resolve_string(token.string) - }), - _ => return None, - }; - p.eat(); - Some(Expr::Lit(Box::new(lit))) + Some( + NodeKind::None + | NodeKind::Auto + | NodeKind::Int(_) + | NodeKind::Float(_) + | NodeKind::Bool(_) + | NodeKind::Fraction(_) + | NodeKind::Length(_, _) + | NodeKind::Angle(_, _) + | NodeKind::Percentage(_) + | NodeKind::Str(_), + ) => { + p.eat(); + true + } + + _ => false, + } } /// Parse something that starts with a parenthesis, which can be either of: @@ -366,433 +313,395 @@ fn literal(p: &mut Parser) -> Option<Expr> { /// - Dictionary literal /// - Parenthesized expression /// - Parameter list of closure expression -fn parenthesized(p: &mut Parser) -> Option<Expr> { - p.start_group(Group::Paren, TokenMode::Code); - let colon = p.eat_if(Token::Colon); - let (items, has_comma) = collection(p); - let span = p.end_group(); +fn parenthesized(p: &mut Parser) -> ParseResult { + let marker = p.marker(); - // Leading colon makes this a dictionary. + p.start_group(Group::Paren); + let colon = p.eat_if(&NodeKind::Colon); + let kind = collection(p).0; + p.end_group(); + + // Leading colon makes this a (empty) dictionary. if colon { - return Some(dict(p, items, span)); + dict(p, marker); + return Ok(()); } // Arrow means this is a closure's parameter list. - if p.eat_if(Token::Arrow) { - let params = params(p, items); - let body = expr(p)?; - return Some(Expr::Closure(Box::new(ClosureExpr { - span: span.join(body.span()), - name: None, - params, - body: Rc::new(body), - }))); + if p.at(&NodeKind::Arrow) { + params(p, marker); + p.eat_assert(&NodeKind::Arrow); + return marker.perform(p, NodeKind::Closure, expr); } - // Find out which kind of collection this is. - Some(match items.as_slice() { - [] => array(p, items, span), - [CallArg::Pos(_)] if !has_comma => match items.into_iter().next() { - Some(CallArg::Pos(expr)) => Expr::Group(Box::new(GroupExpr { span, expr })), - _ => unreachable!(), - }, - [CallArg::Pos(_), ..] => array(p, items, span), - [CallArg::Named(_), ..] => dict(p, items, span), - [CallArg::Spread(expr), ..] => { - p.error(expr.span(), "spreading is not allowed here"); - return None; - } - }) + // Transform into the identified collection. + match kind { + CollectionKind::Group => marker.end(p, NodeKind::Group), + CollectionKind::Positional => array(p, marker), + CollectionKind::Named => dict(p, marker), + } + + Ok(()) +} + +/// The type of a collection. +#[derive(Debug, Copy, Clone, Eq, PartialEq)] +enum CollectionKind { + /// The collection is only one item and has no comma. + Group, + /// The collection starts with a positional and has more items or a trailing + /// comma. + Positional, + /// The collection starts with a named item. + Named, } /// Parse a collection. /// -/// Returns whether the literal contained any commas. -fn collection(p: &mut Parser) -> (Vec<CallArg>, bool) { - let mut items = vec![]; - let mut has_comma = false; - let mut missing_coma = None; +/// Returns the length of the collection and whether the literal contained any +/// commas. +fn collection(p: &mut Parser) -> (CollectionKind, usize) { + let mut kind = CollectionKind::Positional; + let mut items = 0; + let mut can_group = true; + let mut error = false; + let mut missing_coma: Option<Marker> = None; while !p.eof() { - if let Some(arg) = item(p) { - items.push(arg); + if let Ok(item_kind) = item(p) { + if items == 0 && item_kind == NodeKind::Named { + kind = CollectionKind::Named; + can_group = false; + } - if let Some(pos) = missing_coma.take() { - p.expected_at(pos, "comma"); + if item_kind == NodeKind::Spread { + can_group = false; + } + + items += 1; + + if let Some(marker) = missing_coma.take() { + marker.expected(p, "comma"); } if p.eof() { break; } - let behind = p.prev_end(); - if p.eat_if(Token::Comma) { - has_comma = true; + if p.eat_if(&NodeKind::Comma) { + can_group = false; } else { - missing_coma = Some(behind); + missing_coma = Some(p.trivia_start()); } + } else { + error = true; } } - (items, has_comma) + if error || (can_group && items == 1) { + kind = CollectionKind::Group; + } + + (kind, items) } -/// Parse an expression or a named pair. -fn item(p: &mut Parser) -> Option<CallArg> { - if p.eat_if(Token::Dots) { - return expr(p).map(CallArg::Spread); +/// Parse an expression or a named pair, returning whether it's a spread or a +/// named pair. +fn item(p: &mut Parser) -> ParseResult<NodeKind> { + let marker = p.marker(); + if p.eat_if(&NodeKind::Dots) { + marker.perform(p, NodeKind::Spread, expr)?; + return Ok(NodeKind::Spread); } - let first = expr(p)?; - if p.eat_if(Token::Colon) { - if let Expr::Ident(name) = first { - Some(CallArg::Named(Named { name: *name, expr: expr(p)? })) - } else { - p.error(first.span(), "expected identifier"); - expr(p); - None - } + expr(p)?; + + if p.at(&NodeKind::Colon) { + marker.perform(p, NodeKind::Named, |p| { + if let Some(NodeKind::Ident(_)) = marker.peek(p).map(|c| c.kind()) { + p.eat(); + expr(p) + } else { + let error = NodeKind::Error(ErrorPos::Full, "expected identifier".into()); + marker.end(p, error); + p.eat(); + expr(p).ok(); + Err(()) + } + })?; + + Ok(NodeKind::Named) } else { - Some(CallArg::Pos(first)) + Ok(NodeKind::None) } } /// Convert a collection into an array, producing errors for anything other than /// expressions. -fn array(p: &mut Parser, items: Vec<CallArg>, span: Span) -> Expr { - let iter = items.into_iter().filter_map(|item| match item { - CallArg::Pos(expr) => Some(expr), - CallArg::Named(_) => { - p.error(item.span(), "expected expression, found named pair"); - None - } - CallArg::Spread(_) => { - p.error(item.span(), "spreading is not allowed here"); - None - } +fn array(p: &mut Parser, marker: Marker) { + marker.filter_children(p, |x| match x.kind() { + NodeKind::Named => Err("expected expression, found named pair"), + NodeKind::Spread => Err("spreading is not allowed here"), + _ => Ok(()), }); - Expr::Array(Box::new(ArrayExpr { span, items: iter.collect() })) + marker.end(p, NodeKind::Array); } /// Convert a collection into a dictionary, producing errors for anything other /// than named pairs. -fn dict(p: &mut Parser, items: Vec<CallArg>, span: Span) -> Expr { - let iter = items.into_iter().filter_map(|item| match item { - CallArg::Named(named) => Some(named), - CallArg::Pos(_) => { - p.error(item.span(), "expected named pair, found expression"); - None - } - CallArg::Spread(_) => { - p.error(item.span(), "spreading is not allowed here"); - None - } +fn dict(p: &mut Parser, marker: Marker) { + marker.filter_children(p, |x| match x.kind() { + kind if kind.is_paren() => Ok(()), + NodeKind::Named | NodeKind::Comma | NodeKind::Colon => Ok(()), + NodeKind::Spread => Err("spreading is not allowed here"), + _ => Err("expected named pair, found expression"), }); - Expr::Dict(Box::new(DictExpr { span, items: iter.collect() })) + marker.end(p, NodeKind::Dict); } /// Convert a collection into a list of parameters, producing errors for /// anything other than identifiers, spread operations and named pairs. -fn params(p: &mut Parser, items: Vec<CallArg>) -> Vec<ClosureParam> { - let iter = items.into_iter().filter_map(|item| match item { - CallArg::Pos(Expr::Ident(ident)) => Some(ClosureParam::Pos(*ident)), - CallArg::Named(named) => Some(ClosureParam::Named(named)), - CallArg::Spread(Expr::Ident(ident)) => Some(ClosureParam::Sink(*ident)), - _ => { - p.error(item.span(), "expected identifier"); - None - } +fn params(p: &mut Parser, marker: Marker) { + marker.filter_children(p, |x| match x.kind() { + kind if kind.is_paren() => Ok(()), + NodeKind::Named | NodeKind::Comma | NodeKind::Ident(_) => Ok(()), + NodeKind::Spread + if matches!( + x.children().last().map(|child| child.kind()), + Some(&NodeKind::Ident(_)) + ) => + { + Ok(()) + } + _ => Err("expected identifier"), }); - iter.collect() -} - -/// Convert a collection into a list of identifiers, producing errors for -/// anything other than identifiers. -fn idents(p: &mut Parser, items: Vec<CallArg>) -> Vec<Ident> { - let iter = items.into_iter().filter_map(|item| match item { - CallArg::Pos(Expr::Ident(ident)) => Some(*ident), - _ => { - p.error(item.span(), "expected identifier"); - None - } - }); - iter.collect() + marker.end(p, NodeKind::ClosureParams); } // Parse a template block: `[...]`. -fn template(p: &mut Parser) -> Expr { - p.start_group(Group::Bracket, TokenMode::Markup); - let tree = markup(p); - let span = p.end_group(); - Expr::Template(Box::new(TemplateExpr { span, body: tree })) +fn template(p: &mut Parser) { + p.perform(NodeKind::Template, |p| { + p.start_group(Group::Bracket); + markup(p); + p.end_group(); + }); } /// Parse a code block: `{...}`. -fn block(p: &mut Parser) -> Expr { - p.start_group(Group::Brace, TokenMode::Code); - let mut exprs = vec![]; - while !p.eof() { - p.start_group(Group::Stmt, TokenMode::Code); - if let Some(expr) = expr(p) { - exprs.push(expr); - if !p.eof() { - p.expected_at(p.prev_end(), "semicolon or line break"); +fn block(p: &mut Parser) { + p.perform(NodeKind::Block, |p| { + p.start_group(Group::Brace); + while !p.eof() { + p.start_group(Group::Stmt); + if expr(p).is_ok() && !p.eof() { + p.expected_at("semicolon or line break"); } + p.end_group(); + + // Forcefully skip over newlines since the group's contents can't. + p.eat_while(|t| matches!(t, NodeKind::Space(_))); } p.end_group(); - - // Forcefully skip over newlines since the group's contents can't. - p.eat_while(|t| matches!(t, Token::Space(_))); - } - let span = p.end_group(); - Expr::Block(Box::new(BlockExpr { span, exprs })) + }); } /// Parse a function call. -fn call(p: &mut Parser, callee: Expr) -> Option<Expr> { - let mut args = match p.peek_direct() { - Some(Token::LeftParen) => args(p), - Some(Token::LeftBracket) => CallArgs { - span: Span::at(p.id(), callee.span().end), - items: vec![], - }, +fn call(p: &mut Parser, callee: Marker) -> ParseResult { + callee.perform(p, NodeKind::Call, |p| match p.peek_direct() { + Some(NodeKind::LeftParen | NodeKind::LeftBracket) => { + args(p, true); + Ok(()) + } _ => { - p.expected_at(p.prev_end(), "argument list"); - return None; + p.expected_at("argument list"); + Err(()) } - }; - - while p.peek_direct() == Some(Token::LeftBracket) { - let body = template(p); - args.items.push(CallArg::Pos(body)); - } - - Some(Expr::Call(Box::new(CallExpr { - span: p.span_from(callee.span().start), - callee, - args, - }))) + }) } /// Parse the arguments to a function call. -fn args(p: &mut Parser) -> CallArgs { - p.start_group(Group::Paren, TokenMode::Code); - let items = collection(p).0; - let span = p.end_group(); - CallArgs { span, items } +fn args(p: &mut Parser, allow_template: bool) { + p.perform(NodeKind::CallArgs, |p| { + if !allow_template || p.peek_direct() == Some(&NodeKind::LeftParen) { + p.start_group(Group::Paren); + collection(p); + p.end_group(); + } + + while allow_template && p.peek_direct() == Some(&NodeKind::LeftBracket) { + template(p); + } + }) } /// Parse a with expression. -fn with_expr(p: &mut Parser, callee: Expr) -> Option<Expr> { - if p.peek() == Some(Token::LeftParen) { - Some(Expr::With(Box::new(WithExpr { - span: p.span_from(callee.span().start), - callee, - args: args(p), - }))) - } else { - p.expected("argument list"); - None - } +fn with_expr(p: &mut Parser, marker: Marker) -> ParseResult { + marker.perform(p, NodeKind::WithExpr, |p| { + p.eat_assert(&NodeKind::With); + + if p.at(&NodeKind::LeftParen) { + args(p, false); + Ok(()) + } else { + p.expected("argument list"); + Err(()) + } + }) } /// Parse a let expression. -fn let_expr(p: &mut Parser) -> Option<Expr> { - let start = p.next_start(); - p.eat_assert(Token::Let); +fn let_expr(p: &mut Parser) -> ParseResult { + p.perform(NodeKind::LetExpr, |p| { + p.eat_assert(&NodeKind::Let); - let mut output = None; - if let Some(binding) = ident(p) { - let mut init = None; + let marker = p.marker(); + ident(p)?; - if p.eat_if(Token::With) { - init = with_expr(p, Expr::Ident(Box::new(binding.clone()))); + if p.at(&NodeKind::With) { + with_expr(p, marker)?; } else { // If a parenthesis follows, this is a function definition. - let mut maybe_params = None; - if p.peek_direct() == Some(Token::LeftParen) { - p.start_group(Group::Paren, TokenMode::Code); - let items = collection(p).0; - maybe_params = Some(params(p, items)); + let has_params = p.peek_direct() == Some(&NodeKind::LeftParen); + if has_params { + let marker = p.marker(); + p.start_group(Group::Paren); + collection(p); p.end_group(); + params(p, marker); } - if p.eat_if(Token::Eq) { - init = expr(p); - } else if maybe_params.is_some() { + if p.eat_if(&NodeKind::Eq) { + expr(p)?; + } else if has_params { // Function definitions must have a body. - p.expected_at(p.prev_end(), "body"); + p.expected_at("body"); } // Rewrite into a closure expression if it's a function definition. - if let Some(params) = maybe_params { - let body = init?; - init = Some(Expr::Closure(Box::new(ClosureExpr { - span: binding.span.join(body.span()), - name: Some(binding.clone()), - params, - body: Rc::new(body), - }))); + if has_params { + marker.end(p, NodeKind::Closure); } } - output = Some(Expr::Let(Box::new(LetExpr { - span: p.span_from(start), - binding, - init, - }))); - } - - output + Ok(()) + }) } /// Parse an if expresion. -fn if_expr(p: &mut Parser) -> Option<Expr> { - let start = p.next_start(); - p.eat_assert(Token::If); - - let mut output = None; - if let Some(condition) = expr(p) { - if let Some(if_body) = body(p) { - let mut else_body = None; - if p.eat_if(Token::Else) { - if p.peek() == Some(Token::If) { - else_body = if_expr(p); - } else { - else_body = body(p); - } - } +fn if_expr(p: &mut Parser) -> ParseResult { + p.perform(NodeKind::IfExpr, |p| { + p.eat_assert(&NodeKind::If); + + expr(p)?; + body(p)?; - output = Some(Expr::If(Box::new(IfExpr { - span: p.span_from(start), - condition, - if_body, - else_body, - }))); + if p.eat_if(&NodeKind::Else) { + if p.at(&NodeKind::If) { + if_expr(p)?; + } else { + body(p)?; + } } - } - output + Ok(()) + }) } /// Parse a while expresion. -fn while_expr(p: &mut Parser) -> Option<Expr> { - let start = p.next_start(); - p.eat_assert(Token::While); - - let mut output = None; - if let Some(condition) = expr(p) { - if let Some(body) = body(p) { - output = Some(Expr::While(Box::new(WhileExpr { - span: p.span_from(start), - condition, - body, - }))); - } - } - - output +fn while_expr(p: &mut Parser) -> ParseResult { + p.perform(NodeKind::WhileExpr, |p| { + p.eat_assert(&NodeKind::While); + expr(p)?; + body(p)?; + Ok(()) + }) } /// Parse a for expression. -fn for_expr(p: &mut Parser) -> Option<Expr> { - let start = p.next_start(); - p.eat_assert(Token::For); - - let mut output = None; - if let Some(pattern) = for_pattern(p) { - if p.eat_expect(Token::In) { - if let Some(iter) = expr(p) { - if let Some(body) = body(p) { - output = Some(Expr::For(Box::new(ForExpr { - span: p.span_from(start), - pattern, - iter, - body, - }))); - } - } - } - } - - output +fn for_expr(p: &mut Parser) -> ParseResult { + p.perform(NodeKind::ForExpr, |p| { + p.eat_assert(&NodeKind::For); + for_pattern(p)?; + p.eat_expect(&NodeKind::In)?; + expr(p)?; + body(p)?; + Ok(()) + }) } /// Parse a for loop pattern. -fn for_pattern(p: &mut Parser) -> Option<ForPattern> { - let first = ident(p)?; - if p.eat_if(Token::Comma) { - if let Some(second) = ident(p) { - return Some(ForPattern::KeyValue(first, second)); +fn for_pattern(p: &mut Parser) -> ParseResult { + p.perform(NodeKind::ForPattern, |p| { + ident(p)?; + if p.eat_if(&NodeKind::Comma) { + ident(p)?; } - } - Some(ForPattern::Value(first)) + Ok(()) + }) } /// Parse an import expression. -fn import_expr(p: &mut Parser) -> Option<Expr> { - let start = p.next_start(); - p.eat_assert(Token::Import); +fn import_expr(p: &mut Parser) -> ParseResult { + p.perform(NodeKind::ImportExpr, |p| { + p.eat_assert(&NodeKind::Import); + + if !p.eat_if(&NodeKind::Star) { + // This is the list of identifiers scenario. + p.perform(NodeKind::ImportItems, |p| { + p.start_group(Group::Imports); + let marker = p.marker(); + let items = collection(p).1; + if items == 0 { + p.expected_at("import items"); + } + p.end_group(); - let imports = if p.eat_if(Token::Star) { - // This is the wildcard scenario. - Imports::Wildcard - } else { - // This is the list of identifiers scenario. - p.start_group(Group::Imports, TokenMode::Code); - let items = collection(p).0; - if items.is_empty() { - p.expected_at(p.prev_end(), "import items"); - } - p.end_group(); - Imports::Idents(idents(p, items)) - }; + marker.filter_children(p, |n| match n.kind() { + NodeKind::Ident(_) | NodeKind::Comma => Ok(()), + _ => Err("expected identifier"), + }); + }); + }; - let mut output = None; - if p.eat_expect(Token::From) { - if let Some(path) = expr(p) { - output = Some(Expr::Import(Box::new(ImportExpr { - span: p.span_from(start), - imports, - path, - }))); - } - } + p.eat_expect(&NodeKind::From)?; + expr(p)?; - output + Ok(()) + }) } /// Parse an include expression. -fn include_expr(p: &mut Parser) -> Option<Expr> { - let start = p.next_start(); - p.eat_assert(Token::Include); - - expr(p).map(|path| { - Expr::Include(Box::new(IncludeExpr { span: p.span_from(start), path })) +fn include_expr(p: &mut Parser) -> ParseResult { + p.perform(NodeKind::IncludeExpr, |p| { + p.eat_assert(&NodeKind::Include); + expr(p)?; + Ok(()) }) } /// Parse an identifier. -fn ident(p: &mut Parser) -> Option<Ident> { - if let Some(Token::Ident(string)) = p.peek() { - Some(Ident { - span: p.eat_span(), - string: string.into(), - }) - } else { - p.expected("identifier"); - None +fn ident(p: &mut Parser) -> ParseResult { + match p.peek() { + Some(NodeKind::Ident(_)) => { + p.eat(); + Ok(()) + } + _ => { + p.expected("identifier"); + Err(()) + } } } /// Parse a control flow body. -fn body(p: &mut Parser) -> Option<Expr> { +fn body(p: &mut Parser) -> ParseResult { match p.peek() { - Some(Token::LeftBracket) => Some(template(p)), - Some(Token::LeftBrace) => Some(block(p)), + Some(NodeKind::LeftBracket) => template(p), + Some(NodeKind::LeftBrace) => block(p), _ => { - p.expected_at(p.prev_end(), "body"); - None + p.expected_at("body"); + return Err(()); } } + Ok(()) } diff --git a/src/parse/parser.rs b/src/parse/parser.rs index 347d6f71..1c4c2a5c 100644 --- a/src/parse/parser.rs +++ b/src/parse/parser.rs @@ -1,250 +1,216 @@ -use std::ops::Range; +use std::mem; use super::{TokenMode, Tokens}; -use crate::diag::Error; -use crate::source::{SourceFile, SourceId}; -use crate::syntax::{IntoSpan, Pos, Span, Token}; +use crate::syntax::{ErrorPos, Green, GreenData, GreenNode, NodeKind}; +use crate::util::EcoString; + +/// Allows parser methods to use the try operator. Not exposed as the parser +/// recovers from all errors. +pub(crate) type ParseResult<T = ()> = Result<T, ()>; /// A convenient token-based parser. pub struct Parser<'s> { - /// The parsed file. - source: &'s SourceFile, - /// Parsing errors. - errors: Vec<Error>, /// An iterator over the source tokens. tokens: Tokens<'s>, + /// Whether we are at the end of the file or of a group. + eof: bool, + /// The current token. + current: Option<NodeKind>, + /// The end byte index of the last non-trivia token. + prev_end: usize, + /// The start byte index of the peeked token. + current_start: usize, /// The stack of open groups. groups: Vec<GroupEntry>, - /// The next token. - next: Option<Token<'s>>, - /// The peeked token. - /// (Same as `next` except if we are at the end of group, then `None`). - peeked: Option<Token<'s>>, - /// The end index of the last (non-whitespace if in code mode) token. - prev_end: usize, - /// The start index of the peeked token. - next_start: usize, -} - -/// A logical group of tokens, e.g. `[...]`. -struct GroupEntry { - /// The kind of group this is. This decides which tokens will end the group. - /// For example, a [`Group::Paren`] will be ended by - /// [`Token::RightParen`]. - pub kind: Group, - /// The start index of the group. Used by `Parser::end_group` to return the - /// group's full span. - pub start: usize, - /// The mode the parser was in _before_ the group started (to which we go - /// back once the group ends). - pub prev_mode: TokenMode, -} - -/// A group, confined by optional start and end delimiters. -#[derive(Debug, Copy, Clone, Eq, PartialEq)] -pub enum Group { - /// A parenthesized group: `(...)`. - Paren, - /// A bracketed group: `[...]`. - Bracket, - /// A curly-braced group: `{...}`. - Brace, - /// A group ended by a semicolon or a line break: `;`, `\n`. - Stmt, - /// A group for a single expression, ended by a line break. - Expr, - /// A group for import items, ended by a semicolon, line break or `from`. - Imports, + /// The children of the currently built node. + children: Vec<Green>, } impl<'s> Parser<'s> { /// Create a new parser for the source string. - pub fn new(source: &'s SourceFile) -> Self { - let mut tokens = Tokens::new(source.src(), TokenMode::Markup); - let next = tokens.next(); + pub fn new(src: &'s str) -> Self { + let mut tokens = Tokens::new(src, TokenMode::Markup); + let current = tokens.next(); Self { - source, - errors: vec![], tokens, - groups: vec![], - next, - peeked: next, + eof: current.is_none(), + current, prev_end: 0, - next_start: 0, + current_start: 0, + groups: vec![], + children: vec![], } } - /// Finish parsing and return all errors. - pub fn finish(self) -> Vec<Error> { - self.errors + /// End the parsing process and return the last child. + pub fn finish(self) -> Vec<Green> { + self.children } - /// The id of the parsed source file. - pub fn id(&self) -> SourceId { - self.source.id() + /// Create a new marker. + pub fn marker(&mut self) -> Marker { + Marker(self.children.len()) } - /// Whether the end of the source string or group is reached. - pub fn eof(&self) -> bool { - self.peek().is_none() + /// Create a markup right before the trailing trivia. + pub fn trivia_start(&self) -> Marker { + let count = self + .children + .iter() + .rev() + .take_while(|node| self.is_trivia(node.kind())) + .count(); + Marker(self.children.len() - count) } - /// Consume the next token. - pub fn eat(&mut self) -> Option<Token<'s>> { - let token = self.peek()?; - self.bump(); - Some(token) + /// Perform a subparse that wraps its result in a node with the given kind. + pub fn perform<F, T>(&mut self, kind: NodeKind, f: F) -> T + where + F: FnOnce(&mut Self) -> T, + { + let prev = mem::take(&mut self.children); + let output = f(self); + let until = self.trivia_start(); + let mut children = mem::replace(&mut self.children, prev); + + if self.tokens.mode() == TokenMode::Code { + // Trailing trivia should not be wrapped into the new node. + let idx = self.children.len(); + self.children.push(Green::default()); + self.children.extend(children.drain(until.0 ..)); + self.children[idx] = GreenNode::with_children(kind, children).into(); + } else { + self.children.push(GreenNode::with_children(kind, children).into()); + } + + output } - /// Eat the next token and return its source range. - pub fn eat_span(&mut self) -> Span { - let start = self.next_start(); - self.eat(); - Span::new(self.id(), start, self.prev_end()) + /// Whether the end of the source string or group is reached. + pub fn eof(&self) -> bool { + self.eof } - /// Consume the next token if it is the given one. - pub fn eat_if(&mut self, t: Token) -> bool { - if self.peek() == Some(t) { - self.bump(); - true - } else { - false + /// Consume the current token and also trailing trivia. + pub fn eat(&mut self) { + self.prev_end = self.tokens.index(); + self.bump(); + + if self.tokens.mode() == TokenMode::Code { + // Skip whitespace and comments. + while self.current.as_ref().map_or(false, |x| self.is_trivia(x)) { + self.bump(); + } } + + self.repeek(); } - /// Consume the next token if the closure maps it a to `Some`-variant. - pub fn eat_map<T, F>(&mut self, f: F) -> Option<T> - where - F: FnOnce(Token<'s>) -> Option<T>, - { - let token = self.peek()?; - let mapped = f(token); - if mapped.is_some() { - self.bump(); + /// Eat if the current token it is the given one. + pub fn eat_if(&mut self, t: &NodeKind) -> bool { + let at = self.at(t); + if at { + self.eat(); } - mapped + at } - /// Consume the next token if it is the given one and produce an error if - /// not. - pub fn eat_expect(&mut self, t: Token) -> bool { + /// Eat if the current token is the given one and produce an error if not. + pub fn eat_expect(&mut self, t: &NodeKind) -> ParseResult { let eaten = self.eat_if(t); if !eaten { - self.expected_at(self.prev_end(), t.name()); + self.expected_at(t.as_str()); } - eaten + if eaten { Ok(()) } else { Err(()) } } - /// Consume the next token, debug-asserting that it is one of the given ones. - pub fn eat_assert(&mut self, t: Token) { - let next = self.eat(); - debug_assert_eq!(next, Some(t)); + /// Eat, debug-asserting that the token is the given one. + pub fn eat_assert(&mut self, t: &NodeKind) { + debug_assert_eq!(self.peek(), Some(t)); + self.eat(); } - /// Consume tokens while the condition is true. + /// Eat tokens while the condition is true. pub fn eat_while<F>(&mut self, mut f: F) where - F: FnMut(Token<'s>) -> bool, + F: FnMut(&NodeKind) -> bool, { while self.peek().map_or(false, |t| f(t)) { self.eat(); } } - /// Peek at the next token without consuming it. - pub fn peek(&self) -> Option<Token<'s>> { - self.peeked + /// Eat the current token, but change its type. + pub fn convert(&mut self, kind: NodeKind) { + let marker = self.marker(); + self.eat(); + marker.convert(self, kind); } - /// Peek at the next token if it follows immediately after the last one - /// without any whitespace in between. - pub fn peek_direct(&self) -> Option<Token<'s>> { - if self.next_start() == self.prev_end() { - self.peeked - } else { - None - } + /// Whether the current token is of the given type. + pub fn at(&self, kind: &NodeKind) -> bool { + self.peek() == Some(kind) } - /// Peek at the span of the next token. - /// - /// Has length zero if `peek()` returns `None`. - pub fn peek_span(&self) -> Span { - Span::new(self.id(), self.next_start(), self.next_end()) + /// Peek at the current token without consuming it. + pub fn peek(&self) -> Option<&NodeKind> { + if self.eof { None } else { self.current.as_ref() } } - /// Peek at the source of the next token. - pub fn peek_src(&self) -> &'s str { - self.get(self.next_start() .. self.next_end()) + /// Peek at the current token, if it follows immediately after the last one + /// without any trivia in between. + pub fn peek_direct(&self) -> Option<&NodeKind> { + if self.prev_end() == self.current_start() { + self.peek() + } else { + None + } } - /// Checks whether the next token fulfills a condition. - /// - /// Returns `false` if there is no next token. - pub fn check<F>(&self, f: F) -> bool - where - F: FnOnce(Token<'s>) -> bool, - { - self.peek().map_or(false, f) + /// Peek at the source of the current token. + pub fn peek_src(&self) -> &'s str { + self.tokens.scanner().get(self.current_start() .. self.current_end()) } - /// The byte index at which the last token ended. - /// - /// Refers to the end of the last _non-whitespace_ token in code mode. + /// The byte index at which the last non-trivia token ended. pub fn prev_end(&self) -> usize { self.prev_end } - /// The byte index at which the next token starts. - pub fn next_start(&self) -> usize { - self.next_start + /// The byte index at which the current token starts. + pub fn current_start(&self) -> usize { + self.current_start } - /// The byte index at which the next token will end. - /// - /// Is the same as [`next_start()`][Self::next_start] if `peek()` returns - /// `None`. - pub fn next_end(&self) -> usize { + /// The byte index at which the current token ends. + pub fn current_end(&self) -> usize { self.tokens.index() } /// Determine the column index for the given byte index. pub fn column(&self, index: usize) -> usize { - self.source.byte_to_column(index).unwrap() - } - - /// Slice out part of the source string. - pub fn get(&self, range: Range<usize>) -> &'s str { - self.source.get(range).unwrap() - } - - /// The span from `start` to [`self.prev_end()`](Self::prev_end). - pub fn span_from(&self, start: impl Into<Pos>) -> Span { - Span::new(self.id(), start, self.prev_end()) + self.tokens.scanner().column(index) } /// Continue parsing in a group. /// /// When the end delimiter of the group is reached, all subsequent calls to - /// `eat()` and `peek()` return `None`. Parsing can only continue with - /// a matching call to `end_group`. + /// `peek()` return `None`. Parsing can only continue with a matching call + /// to `end_group`. /// - /// This panics if the next token does not start the given group. - pub fn start_group(&mut self, kind: Group, mode: TokenMode) { - self.groups.push(GroupEntry { - kind, - start: self.next_start(), - prev_mode: self.tokens.mode(), + /// This panics if the current token does not start the given group. + pub fn start_group(&mut self, kind: Group) { + self.groups.push(GroupEntry { kind, prev_mode: self.tokens.mode() }); + self.tokens.set_mode(match kind { + Group::Bracket => TokenMode::Markup, + _ => TokenMode::Code, }); - self.tokens.set_mode(mode); self.repeek(); - match kind { - Group::Paren => self.eat_assert(Token::LeftParen), - Group::Bracket => self.eat_assert(Token::LeftBracket), - Group::Brace => self.eat_assert(Token::LeftBrace), + Group::Paren => self.eat_assert(&NodeKind::LeftParen), + Group::Bracket => self.eat_assert(&NodeKind::LeftBracket), + Group::Brace => self.eat_assert(&NodeKind::LeftBrace), Group::Stmt => {} Group::Expr => {} Group::Imports => {} @@ -254,130 +220,228 @@ impl<'s> Parser<'s> { /// End the parsing of a group. /// /// This panics if no group was started. - pub fn end_group(&mut self) -> Span { - let prev_mode = self.tokens.mode(); + pub fn end_group(&mut self) { + let group_mode = self.tokens.mode(); let group = self.groups.pop().expect("no started group"); self.tokens.set_mode(group.prev_mode); self.repeek(); - let mut rescan = self.tokens.mode() != prev_mode; + let mut rescan = self.tokens.mode() != group_mode; // Eat the end delimiter if there is one. if let Some((end, required)) = match group.kind { - Group::Paren => Some((Token::RightParen, true)), - Group::Bracket => Some((Token::RightBracket, true)), - Group::Brace => Some((Token::RightBrace, true)), - Group::Stmt => Some((Token::Semicolon, false)), + Group::Paren => Some((NodeKind::RightParen, true)), + Group::Bracket => Some((NodeKind::RightBracket, true)), + Group::Brace => Some((NodeKind::RightBrace, true)), + Group::Stmt => Some((NodeKind::Semicolon, false)), Group::Expr => None, Group::Imports => None, } { - if self.next == Some(end) { + if self.current.as_ref() == Some(&end) { // Bump the delimeter and return. No need to rescan in this case. - self.bump(); + self.eat(); rescan = false; } else if required { - self.error( - self.next_start() .. self.next_start(), - format!("expected {}", end.name()), - ); + self.push_error(format!("expected {}", end)); } } // Rescan the peeked token if the mode changed. if rescan { + if group_mode == TokenMode::Code { + self.children.truncate(self.trivia_start().0); + } + self.tokens.jump(self.prev_end()); - self.bump(); + self.prev_end = self.tokens.index(); + self.current_start = self.tokens.index(); + self.current = self.tokens.next(); + self.repeek(); } + } - Span::new(self.id(), group.start, self.prev_end()) + /// Low-level bump that consumes exactly one token without special trivia + /// handling. + fn bump(&mut self) { + let kind = self.current.take().unwrap(); + let len = self.tokens.index() - self.current_start; + self.children.push(GreenData::new(kind, len).into()); + self.current_start = self.tokens.index(); + self.current = self.tokens.next(); } - /// Add an error with location and message. - pub fn error(&mut self, span: impl IntoSpan, message: impl Into<String>) { - self.errors.push(Error::new(span.into_span(self.id()), message)); + /// Take another look at the current token to recheck whether it ends a + /// group. + fn repeek(&mut self) { + self.eof = match &self.current { + Some(NodeKind::RightParen) => self.inside(Group::Paren), + Some(NodeKind::RightBracket) => self.inside(Group::Bracket), + Some(NodeKind::RightBrace) => self.inside(Group::Brace), + Some(NodeKind::Semicolon) => self.inside(Group::Stmt), + Some(NodeKind::From) => self.inside(Group::Imports), + Some(NodeKind::Space(n)) => *n >= 1 && self.stop_at_newline(), + Some(_) => false, + None => true, + }; } - /// Add an error that `what` was expected at the given span. - pub fn expected_at(&mut self, span: impl IntoSpan, what: &str) { - self.error(span, format!("expected {}", what)); + /// Returns whether the given type can be skipped over. + fn is_trivia(&self, token: &NodeKind) -> bool { + Self::is_trivia_ext(token, self.stop_at_newline()) } - /// Eat the next token and add an error that it is not the expected `thing`. - pub fn expected(&mut self, what: &str) { - let before = self.next_start(); - if let Some(found) = self.eat() { - let after = self.prev_end(); - self.error( - before .. after, - format!("expected {}, found {}", what, found.name()), - ); - } else { - self.expected_at(self.next_start(), what); + /// Returns whether the given type can be skipped over given the current + /// newline mode. + fn is_trivia_ext(token: &NodeKind, stop_at_newline: bool) -> bool { + match token { + NodeKind::Space(n) => *n == 0 || !stop_at_newline, + NodeKind::LineComment => true, + NodeKind::BlockComment => true, + _ => false, } } - /// Eat the next token and add an error that it is unexpected. + /// Whether the active group must end at a newline. + fn stop_at_newline(&self) -> bool { + matches!( + self.groups.last().map(|group| group.kind), + Some(Group::Stmt | Group::Expr | Group::Imports) + ) + } + + /// Whether we are inside the given group. + fn inside(&self, kind: Group) -> bool { + self.groups.iter().any(|g| g.kind == kind) + } +} + +/// Error handling. +impl Parser<'_> { + /// Push an error into the children list. + pub fn push_error(&mut self, msg: impl Into<EcoString>) { + let error = NodeKind::Error(ErrorPos::Full, msg.into()); + self.children.push(GreenData::new(error, 0).into()); + } + + /// Eat the current token and add an error that it is unexpected. pub fn unexpected(&mut self) { - let before = self.next_start(); - if let Some(found) = self.eat() { - let after = self.prev_end(); - self.error(before .. after, format!("unexpected {}", found.name())); + match self.peek() { + Some(found) => { + let msg = format!("unexpected {}", found); + let error = NodeKind::Error(ErrorPos::Full, msg.into()); + self.perform(error, Self::eat); + } + None => self.push_error("unexpected end of file"), } } - /// Move to the next token. - fn bump(&mut self) { - self.prev_end = self.tokens.index().into(); - self.next_start = self.tokens.index().into(); - self.next = self.tokens.next(); - - if self.tokens.mode() == TokenMode::Code { - // Skip whitespace and comments. - while match self.next { - Some(Token::Space(n)) => n < 1 || !self.stop_at_newline(), - Some(Token::LineComment(_)) => true, - Some(Token::BlockComment(_)) => true, - _ => false, - } { - self.next_start = self.tokens.index().into(); - self.next = self.tokens.next(); + /// Eat the current token and add an error that it is not the expected `thing`. + pub fn expected(&mut self, thing: &str) { + match self.peek() { + Some(found) => { + let msg = format!("expected {}, found {}", thing, found); + let error = NodeKind::Error(ErrorPos::Full, msg.into()); + self.perform(error, Self::eat); } + None => self.expected_at(thing), } + } - self.repeek(); + /// Add an error that the `thing` was expected at the end of the last + /// non-trivia token. + pub fn expected_at(&mut self, thing: &str) { + self.trivia_start().expected(self, thing); } +} - /// Take another look at the next token to recheck whether it ends a group. - fn repeek(&mut self) { - self.peeked = self.next; - let token = match self.next { - Some(token) => token, - None => return, - }; +/// A marker that indicates where a node may start. +#[derive(Debug, Copy, Clone, Eq, PartialEq)] +pub struct Marker(usize); - if match token { - Token::RightParen => self.inside(Group::Paren), - Token::RightBracket => self.inside(Group::Bracket), - Token::RightBrace => self.inside(Group::Brace), - Token::Semicolon => self.inside(Group::Stmt), - Token::From => self.inside(Group::Imports), - Token::Space(n) => n >= 1 && self.stop_at_newline(), - _ => false, - } { - self.peeked = None; +impl Marker { + /// Perform a subparse that wraps all children after the marker in a node + /// with the given kind. + pub fn perform<T, F>(self, p: &mut Parser, kind: NodeKind, f: F) -> T + where + F: FnOnce(&mut Parser) -> T, + { + let success = f(p); + self.end(p, kind); + success + } + + /// Wrap all children after the marker (excluding trailing trivia) in a node + /// with the given `kind`. + pub fn end(self, p: &mut Parser, kind: NodeKind) { + let until = p.trivia_start(); + let children = p.children.drain(self.0 .. until.0).collect(); + p.children + .insert(self.0, GreenNode::with_children(kind, children).into()); + } + + /// Wrap all children that do not fulfill the predicate in error nodes. + pub fn filter_children<F>(self, p: &mut Parser, f: F) + where + F: Fn(&Green) -> Result<(), &'static str>, + { + for child in &mut p.children[self.0 ..] { + if (p.tokens.mode() == TokenMode::Markup + || !Parser::is_trivia_ext(child.kind(), false)) + && !child.kind().is_error() + { + if let Err(msg) = f(child) { + let error = NodeKind::Error(ErrorPos::Full, msg.into()); + let inner = mem::take(child); + *child = GreenNode::with_child(error, inner).into(); + } + } } } - /// Whether the active group ends at a newline. - fn stop_at_newline(&self) -> bool { - matches!( - self.groups.last().map(|group| group.kind), - Some(Group::Stmt | Group::Expr | Group::Imports) - ) + /// Insert an error message that `what` was expected at the marker position. + pub fn expected(self, p: &mut Parser, what: &str) { + let msg = format!("expected {}", what); + let error = NodeKind::Error(ErrorPos::Full, msg.into()); + p.children.insert(self.0, GreenData::new(error, 0).into()); } - /// Whether we are inside the given group. - fn inside(&self, kind: Group) -> bool { - self.groups.iter().any(|g| g.kind == kind) + /// Peek at the child directly after the marker. + pub fn peek<'a>(self, p: &'a Parser) -> Option<&'a Green> { + p.children.get(self.0) } + + /// Convert the child directly after marker. + pub fn convert(self, p: &mut Parser, kind: NodeKind) { + if let Some(child) = p.children.get_mut(self.0) { + child.convert(kind); + } + } +} + +/// A logical group of tokens, e.g. `[...]`. +struct GroupEntry { + /// The kind of group this is. This decides which tokens will end the group. + /// For example, a [`Group::Paren`] will be ended by + /// [`Token::RightParen`]. + pub kind: Group, + /// The mode the parser was in _before_ the group started (to which we go + /// back once the group ends). + pub prev_mode: TokenMode, +} + +/// A group, confined by optional start and end delimiters. +#[derive(Debug, Copy, Clone, Eq, PartialEq)] +pub enum Group { + /// A bracketed group: `[...]`. + Bracket, + /// A curly-braced group: `{...}`. + Brace, + /// A parenthesized group: `(...)`. + Paren, + /// A group ended by a semicolon or a line break: `;`, `\n`. + Stmt, + /// A group for a single expression, ended by a line break. + Expr, + /// A group for import items, ended by a semicolon, line break or `from`. + Imports, } diff --git a/src/parse/resolve.rs b/src/parse/resolve.rs index 1b323847..e15ae339 100644 --- a/src/parse/resolve.rs +++ b/src/parse/resolve.rs @@ -1,5 +1,5 @@ -use super::{is_newline, Scanner}; -use crate::syntax::{Ident, RawNode, Span}; +use super::{is_ident, is_newline, Scanner}; +use crate::syntax::ast::RawNode; use crate::util::EcoString; /// Resolve all escape sequences in a string. @@ -25,11 +25,9 @@ pub fn resolve_string(string: &str) -> EcoString { let sequence = s.eat_while(|c| c.is_ascii_hexdigit()); let _terminated = s.eat_if('}'); - if let Some(c) = resolve_hex(sequence) { - out.push(c); - } else { - // TODO: Feedback that unicode escape sequence is wrong. - out.push_str(s.eaten_from(start)); + match resolve_hex(sequence) { + Some(c) => out.push(c), + None => out.push_str(s.eaten_from(start)), } } @@ -48,19 +46,17 @@ pub fn resolve_hex(sequence: &str) -> Option<char> { } /// Resolve the language tag and trims the raw text. -pub fn resolve_raw(span: Span, column: usize, backticks: usize, text: &str) -> RawNode { +pub fn resolve_raw(column: usize, backticks: usize, text: &str) -> RawNode { if backticks > 1 { let (tag, inner) = split_at_lang_tag(text); let (text, block) = trim_and_split_raw(column, inner); RawNode { - span, - lang: Ident::new(tag, span.with_end(span.start + tag.len())), + lang: is_ident(tag).then(|| tag.into()), text: text.into(), block, } } else { RawNode { - span, lang: None, text: split_lines(text).join("\n").into(), block: false, @@ -140,7 +136,6 @@ fn split_lines(text: &str) -> Vec<&str> { #[cfg(test)] #[rustfmt::skip] mod tests { - use crate::syntax::Span; use super::*; #[test] @@ -190,7 +185,7 @@ mod tests { text: &str, block: bool, ) { - let node = resolve_raw(Span::detached(), column, backticks, raw); + let node = resolve_raw(column, backticks, raw); assert_eq!(node.lang.as_deref(), lang); assert_eq!(node.text, text); assert_eq!(node.block, block); diff --git a/src/parse/scanner.rs b/src/parse/scanner.rs index 8e3e4278..ea06a2e0 100644 --- a/src/parse/scanner.rs +++ b/src/parse/scanner.rs @@ -1,5 +1,7 @@ use std::slice::SliceIndex; +use unicode_xid::UnicodeXID; + /// A featureful char-based scanner. #[derive(Copy, Clone)] pub struct Scanner<'s> { @@ -114,6 +116,12 @@ impl<'s> Scanner<'s> { self.index = index; } + /// The full source string. + #[inline] + pub fn src(&self) -> &'s str { + &self.src + } + /// Slice out part of the source string. #[inline] pub fn get<I>(&self, index: I) -> &'s str @@ -150,6 +158,16 @@ impl<'s> Scanner<'s> { // optimized away in some cases. self.src.get(start .. self.index).unwrap_or_default() } + + /// The column index of a given index in the source string. + #[inline] + pub fn column(&self, index: usize) -> usize { + self.src[.. index] + .chars() + .rev() + .take_while(|&c| !is_newline(c)) + .count() + } } /// Whether this character denotes a newline. @@ -163,3 +181,30 @@ pub fn is_newline(character: char) -> bool { '\u{0085}' | '\u{2028}' | '\u{2029}' ) } + +/// Whether a string is a valid unicode identifier. +/// +/// In addition to what is specified in the [Unicode Standard][uax31], we allow: +/// - `_` as a starting character, +/// - `_` and `-` as continuing characters. +/// +/// [uax31]: http://www.unicode.org/reports/tr31/ +#[inline] +pub fn is_ident(string: &str) -> bool { + let mut chars = string.chars(); + chars + .next() + .map_or(false, |c| is_id_start(c) && chars.all(is_id_continue)) +} + +/// Whether a character can start an identifier. +#[inline] +pub fn is_id_start(c: char) -> bool { + c.is_xid_start() || c == '_' +} + +/// Whether a character can continue an identifier. +#[inline] +pub fn is_id_continue(c: char) -> bool { + c.is_xid_continue() || c == '_' || c == '-' +} diff --git a/src/parse/tokens.rs b/src/parse/tokens.rs index 5f969452..96dfd9d1 100644 --- a/src/parse/tokens.rs +++ b/src/parse/tokens.rs @@ -1,6 +1,13 @@ -use super::{is_newline, Scanner}; +use std::rc::Rc; + +use super::{ + is_id_continue, is_id_start, is_newline, resolve_hex, resolve_raw, resolve_string, + Scanner, +}; use crate::geom::{AngularUnit, LengthUnit}; -use crate::syntax::*; +use crate::syntax::ast::{MathNode, RawNode}; +use crate::syntax::{ErrorPos, NodeKind}; +use crate::util::EcoString; /// An iterator over the tokens of a string of source code. pub struct Tokens<'s> { @@ -59,7 +66,7 @@ impl<'s> Tokens<'s> { } impl<'s> Iterator for Tokens<'s> { - type Item = Token<'s>; + type Item = NodeKind; /// Parse the next token in the source code. #[inline] @@ -68,19 +75,21 @@ impl<'s> Iterator for Tokens<'s> { let c = self.s.eat()?; Some(match c { // Blocks and templates. - '[' => Token::LeftBracket, - ']' => Token::RightBracket, - '{' => Token::LeftBrace, - '}' => Token::RightBrace, + '[' => NodeKind::LeftBracket, + ']' => NodeKind::RightBracket, + '{' => NodeKind::LeftBrace, + '}' => NodeKind::RightBrace, // Whitespace. - ' ' if self.s.check_or(true, |c| !c.is_whitespace()) => Token::Space(0), + ' ' if self.s.check_or(true, |c| !c.is_whitespace()) => NodeKind::Space(0), c if c.is_whitespace() => self.whitespace(), // Comments with special case for URLs. '/' if self.s.eat_if('*') => self.block_comment(), '/' if !self.maybe_in_url() && self.s.eat_if('/') => self.line_comment(), - '*' if self.s.eat_if('/') => Token::Invalid(self.s.eaten_from(start)), + '*' if self.s.eat_if('/') => { + NodeKind::Unknown(self.s.eaten_from(start).into()) + } // Other things. _ => match self.mode { @@ -93,7 +102,7 @@ impl<'s> Iterator for Tokens<'s> { impl<'s> Tokens<'s> { #[inline] - fn markup(&mut self, start: usize, c: char) -> Token<'s> { + fn markup(&mut self, start: usize, c: char) -> NodeKind { match c { // Escape sequences. '\\' => self.backslash(), @@ -102,13 +111,15 @@ impl<'s> Tokens<'s> { '#' => self.hash(), // Markup. - '~' => Token::Tilde, - '*' => Token::Star, - '_' => Token::Underscore, + '~' => NodeKind::NonBreakingSpace, + '*' => NodeKind::Strong, + '_' => NodeKind::Emph, '`' => self.raw(), '$' => self.math(), - '-' => self.hyph(start), - '=' if self.s.check_or(true, |c| c == '=' || c.is_whitespace()) => Token::Eq, + '-' => self.hyph(), + '=' if self.s.check_or(true, |c| c == '=' || c.is_whitespace()) => { + NodeKind::Eq + } c if c == '.' || c.is_ascii_digit() => self.numbering(start, c), // Plain text. @@ -116,35 +127,35 @@ impl<'s> Tokens<'s> { } } - fn code(&mut self, start: usize, c: char) -> Token<'s> { + fn code(&mut self, start: usize, c: char) -> NodeKind { match c { // Parens. - '(' => Token::LeftParen, - ')' => Token::RightParen, + '(' => NodeKind::LeftParen, + ')' => NodeKind::RightParen, // Length two. - '=' if self.s.eat_if('=') => Token::EqEq, - '!' if self.s.eat_if('=') => Token::ExclEq, - '<' if self.s.eat_if('=') => Token::LtEq, - '>' if self.s.eat_if('=') => Token::GtEq, - '+' if self.s.eat_if('=') => Token::PlusEq, - '-' if self.s.eat_if('=') => Token::HyphEq, - '*' if self.s.eat_if('=') => Token::StarEq, - '/' if self.s.eat_if('=') => Token::SlashEq, - '.' if self.s.eat_if('.') => Token::Dots, - '=' if self.s.eat_if('>') => Token::Arrow, + '=' if self.s.eat_if('=') => NodeKind::EqEq, + '!' if self.s.eat_if('=') => NodeKind::ExclEq, + '<' if self.s.eat_if('=') => NodeKind::LtEq, + '>' if self.s.eat_if('=') => NodeKind::GtEq, + '+' if self.s.eat_if('=') => NodeKind::PlusEq, + '-' if self.s.eat_if('=') => NodeKind::HyphEq, + '*' if self.s.eat_if('=') => NodeKind::StarEq, + '/' if self.s.eat_if('=') => NodeKind::SlashEq, + '.' if self.s.eat_if('.') => NodeKind::Dots, + '=' if self.s.eat_if('>') => NodeKind::Arrow, // Length one. - ',' => Token::Comma, - ';' => Token::Semicolon, - ':' => Token::Colon, - '+' => Token::Plus, - '-' => Token::Hyph, - '*' => Token::Star, - '/' => Token::Slash, - '=' => Token::Eq, - '<' => Token::Lt, - '>' => Token::Gt, + ',' => NodeKind::Comma, + ';' => NodeKind::Semicolon, + ':' => NodeKind::Colon, + '+' => NodeKind::Plus, + '-' => NodeKind::Minus, + '*' => NodeKind::Star, + '/' => NodeKind::Slash, + '=' => NodeKind::Eq, + '<' => NodeKind::Lt, + '>' => NodeKind::Gt, // Identifiers. c if is_id_start(c) => self.ident(start), @@ -159,12 +170,12 @@ impl<'s> Tokens<'s> { // Strings. '"' => self.string(), - _ => Token::Invalid(self.s.eaten_from(start)), + _ => NodeKind::Unknown(self.s.eaten_from(start).into()), } } #[inline] - fn text(&mut self, start: usize) -> Token<'s> { + fn text(&mut self, start: usize) -> NodeKind { macro_rules! table { ($($c:literal)|*) => {{ let mut t = [false; 128]; @@ -186,10 +197,10 @@ impl<'s> Tokens<'s> { TABLE.get(c as usize).copied().unwrap_or_else(|| c.is_whitespace()) }); - Token::Text(self.s.eaten_from(start)) + NodeKind::Text(self.s.eaten_from(start).into()) } - fn whitespace(&mut self) -> Token<'s> { + fn whitespace(&mut self) -> NodeKind { self.s.uneat(); // Count the number of newlines. @@ -208,73 +219,81 @@ impl<'s> Tokens<'s> { } } - Token::Space(newlines) + NodeKind::Space(newlines) } - fn backslash(&mut self) -> Token<'s> { - if let Some(c) = self.s.peek() { - match c { + fn backslash(&mut self) -> NodeKind { + match self.s.peek() { + Some(c) => match c { // Backslash and comments. '\\' | '/' | // Parenthesis and hashtag. '[' | ']' | '{' | '}' | '#' | // Markup. '*' | '_' | '=' | '~' | '`' | '$' => { - let start = self.s.index(); self.s.eat_assert(c); - Token::Text(&self.s.eaten_from(start)) + NodeKind::Text(c.into()) } 'u' if self.s.rest().starts_with("u{") => { self.s.eat_assert('u'); self.s.eat_assert('{'); - Token::UnicodeEscape(UnicodeEscapeToken { - // Allow more than `ascii_hexdigit` for better error recovery. - sequence: self.s.eat_while(|c| c.is_ascii_alphanumeric()), - terminated: self.s.eat_if('}'), - }) + let sequence = self.s.eat_while(|c| c.is_ascii_alphanumeric()); + if self.s.eat_if('}') { + if let Some(c) = resolve_hex(&sequence) { + NodeKind::UnicodeEscape(c) + } else { + NodeKind::Error( + ErrorPos::Full, + "invalid unicode escape sequence".into(), + ) + } + } else { + NodeKind::Error( + ErrorPos::End, + "expected closing brace".into(), + ) + } } - c if c.is_whitespace() => Token::Backslash, - _ => Token::Text("\\"), - } - } else { - Token::Backslash + c if c.is_whitespace() => NodeKind::Linebreak, + _ => NodeKind::Text('\\'.into()), + }, + None => NodeKind::Linebreak, } } #[inline] - fn hash(&mut self) -> Token<'s> { + fn hash(&mut self) -> NodeKind { if self.s.check_or(false, is_id_start) { let read = self.s.eat_while(is_id_continue); - if let Some(keyword) = keyword(read) { - keyword - } else { - Token::Ident(read) + match keyword(read) { + Some(keyword) => keyword, + None => NodeKind::Ident(read.into()), } } else { - Token::Text("#") + NodeKind::Text("#".into()) } } - fn hyph(&mut self, start: usize) -> Token<'s> { + fn hyph(&mut self) -> NodeKind { if self.s.eat_if('-') { if self.s.eat_if('-') { - Token::HyphHyphHyph + NodeKind::EmDash } else { - Token::HyphHyph + NodeKind::EnDash } } else if self.s.check_or(true, char::is_whitespace) { - Token::Hyph + NodeKind::Minus } else { - Token::Text(self.s.eaten_from(start)) + NodeKind::Text("-".into()) } } - fn numbering(&mut self, start: usize, c: char) -> Token<'s> { + fn numbering(&mut self, start: usize, c: char) -> NodeKind { let number = if c != '.' { self.s.eat_while(|c| c.is_ascii_digit()); let read = self.s.eaten_from(start); if !self.s.eat_if('.') { - return Token::Text(read); + return NodeKind::Text(self.s.eaten_from(start).into()); } read.parse().ok() } else { @@ -282,13 +301,15 @@ impl<'s> Tokens<'s> { }; if self.s.check_or(true, char::is_whitespace) { - Token::Numbering(number) + NodeKind::EnumNumbering(number) } else { - Token::Text(self.s.eaten_from(start)) + NodeKind::Text(self.s.eaten_from(start).into()) } } - fn raw(&mut self) -> Token<'s> { + fn raw(&mut self) -> NodeKind { + let column = self.s.column(self.s.index() - 1); + let mut backticks = 1; while self.s.eat_if('`') { backticks += 1; @@ -296,7 +317,11 @@ impl<'s> Tokens<'s> { // Special case for empty inline block. if backticks == 2 { - return Token::Raw(RawToken { text: "", backticks: 1, terminated: true }); + return NodeKind::Raw(Rc::new(RawNode { + text: EcoString::new(), + lang: None, + block: false, + })); } let start = self.s.index(); @@ -310,17 +335,30 @@ impl<'s> Tokens<'s> { } } - let terminated = found == backticks; - let end = self.s.index() - if terminated { found } else { 0 }; + if found == backticks { + let end = self.s.index() - found as usize; + NodeKind::Raw(Rc::new(resolve_raw( + column, + backticks, + self.s.get(start .. end).into(), + ))) + } else { + let remaining = backticks - found; + let noun = if remaining == 1 { "backtick" } else { "backticks" }; - Token::Raw(RawToken { - text: self.s.get(start .. end), - backticks, - terminated, - }) + NodeKind::Error( + ErrorPos::End, + if found == 0 { + format!("expected {} {}", remaining, noun) + } else { + format!("expected {} more {}", remaining, noun) + } + .into(), + ) + } } - fn math(&mut self) -> Token<'s> { + fn math(&mut self) -> NodeKind { let mut display = false; if self.s.eat_if('[') { display = true; @@ -350,25 +388,36 @@ impl<'s> Tokens<'s> { (true, true) => 2, }; - Token::Math(MathToken { - formula: self.s.get(start .. end), - display, - terminated, - }) + if terminated { + NodeKind::Math(Rc::new(MathNode { + formula: self.s.get(start .. end).into(), + display, + })) + } else { + NodeKind::Error( + ErrorPos::End, + if !display || (!escaped && dollar) { + "expected closing dollar sign" + } else { + "expected closing bracket and dollar sign" + } + .into(), + ) + } } - fn ident(&mut self, start: usize) -> Token<'s> { + fn ident(&mut self, start: usize) -> NodeKind { self.s.eat_while(is_id_continue); match self.s.eaten_from(start) { - "none" => Token::None, - "auto" => Token::Auto, - "true" => Token::Bool(true), - "false" => Token::Bool(false), - id => keyword(id).unwrap_or(Token::Ident(id)), + "none" => NodeKind::None, + "auto" => NodeKind::Auto, + "true" => NodeKind::Bool(true), + "false" => NodeKind::Bool(false), + id => keyword(id).unwrap_or(NodeKind::Ident(id.into())), } } - fn number(&mut self, start: usize, c: char) -> Token<'s> { + fn number(&mut self, start: usize, c: char) -> NodeKind { // Read the first part (integer or fractional depending on `first`). self.s.eat_while(|c| c.is_ascii_digit()); @@ -396,55 +445,56 @@ impl<'s> Tokens<'s> { // Find out whether it is a simple number. if suffix.is_empty() { - if let Ok(int) = number.parse::<i64>() { - return Token::Int(int); - } else if let Ok(float) = number.parse::<f64>() { - return Token::Float(float); + if let Ok(i) = number.parse::<i64>() { + return NodeKind::Int(i); } } - // Otherwise parse into the fitting numeric type. - let build = match suffix { - "%" => Token::Percent, - "fr" => Token::Fraction, - "pt" => |x| Token::Length(x, LengthUnit::Pt), - "mm" => |x| Token::Length(x, LengthUnit::Mm), - "cm" => |x| Token::Length(x, LengthUnit::Cm), - "in" => |x| Token::Length(x, LengthUnit::In), - "rad" => |x| Token::Angle(x, AngularUnit::Rad), - "deg" => |x| Token::Angle(x, AngularUnit::Deg), - _ => return Token::Invalid(all), - }; - - if let Ok(float) = number.parse::<f64>() { - build(float) + if let Ok(f) = number.parse::<f64>() { + match suffix { + "" => NodeKind::Float(f), + "%" => NodeKind::Percentage(f), + "fr" => NodeKind::Fraction(f), + "pt" => NodeKind::Length(f, LengthUnit::Pt), + "mm" => NodeKind::Length(f, LengthUnit::Mm), + "cm" => NodeKind::Length(f, LengthUnit::Cm), + "in" => NodeKind::Length(f, LengthUnit::In), + "deg" => NodeKind::Angle(f, AngularUnit::Deg), + "rad" => NodeKind::Angle(f, AngularUnit::Rad), + _ => { + return NodeKind::Unknown(all.into()); + } + } } else { - Token::Invalid(all) + NodeKind::Unknown(all.into()) } } - fn string(&mut self) -> Token<'s> { + + fn string(&mut self) -> NodeKind { let mut escaped = false; - Token::Str(StrToken { - string: self.s.eat_until(|c| { - if c == '"' && !escaped { - true - } else { - escaped = c == '\\' && !escaped; - false - } - }), - terminated: self.s.eat_if('"'), - }) - } + let string = resolve_string(self.s.eat_until(|c| { + if c == '"' && !escaped { + true + } else { + escaped = c == '\\' && !escaped; + false + } + })); - fn line_comment(&mut self) -> Token<'s> { - Token::LineComment(self.s.eat_until(is_newline)) + if self.s.eat_if('"') { + NodeKind::Str(string) + } else { + NodeKind::Error(ErrorPos::End, "expected quote".into()) + } } - fn block_comment(&mut self) -> Token<'s> { - let start = self.s.index(); + fn line_comment(&mut self) -> NodeKind { + self.s.eat_until(is_newline); + NodeKind::LineComment + } + fn block_comment(&mut self) -> NodeKind { let mut state = '_'; let mut depth = 1; @@ -466,10 +516,7 @@ impl<'s> Tokens<'s> { } } - let terminated = depth == 0; - let end = self.s.index() - if terminated { 2 } else { 0 }; - - Token::BlockComment(self.s.get(start .. end)) + NodeKind::BlockComment } fn maybe_in_url(&self) -> bool { @@ -477,24 +524,24 @@ impl<'s> Tokens<'s> { } } -fn keyword(ident: &str) -> Option<Token<'static>> { +fn keyword(ident: &str) -> Option<NodeKind> { Some(match ident { - "not" => Token::Not, - "and" => Token::And, - "or" => Token::Or, - "with" => Token::With, - "let" => Token::Let, - "if" => Token::If, - "else" => Token::Else, - "for" => Token::For, - "in" => Token::In, - "while" => Token::While, - "break" => Token::Break, - "continue" => Token::Continue, - "return" => Token::Return, - "import" => Token::Import, - "include" => Token::Include, - "from" => Token::From, + "not" => NodeKind::Not, + "and" => NodeKind::And, + "or" => NodeKind::Or, + "with" => NodeKind::With, + "let" => NodeKind::Let, + "if" => NodeKind::If, + "else" => NodeKind::Else, + "for" => NodeKind::For, + "in" => NodeKind::In, + "while" => NodeKind::While, + "break" => NodeKind::Break, + "continue" => NodeKind::Continue, + "return" => NodeKind::Return, + "import" => NodeKind::Import, + "include" => NodeKind::Include, + "from" => NodeKind::From, _ => return None, }) } @@ -506,24 +553,45 @@ mod tests { use super::*; + use ErrorPos::*; + use NodeKind::*; use Option::None; - use Token::{Ident, *}; use TokenMode::{Code, Markup}; - const fn UnicodeEscape(sequence: &str, terminated: bool) -> Token { - Token::UnicodeEscape(UnicodeEscapeToken { sequence, terminated }) + fn UnicodeEscape(c: char) -> NodeKind { + NodeKind::UnicodeEscape(c) } - const fn Raw(text: &str, backticks: usize, terminated: bool) -> Token { - Token::Raw(RawToken { text, backticks, terminated }) + fn Error(pos: ErrorPos, message: &str) -> NodeKind { + NodeKind::Error(pos, message.into()) } - const fn Math(formula: &str, display: bool, terminated: bool) -> Token { - Token::Math(MathToken { formula, display, terminated }) + fn Raw(text: &str, lang: Option<&str>, block: bool) -> NodeKind { + NodeKind::Raw(Rc::new(RawNode { + text: text.into(), + lang: lang.map(Into::into), + block, + })) } - const fn Str(string: &str, terminated: bool) -> Token { - Token::Str(StrToken { string, terminated }) + fn Math(formula: &str, display: bool) -> NodeKind { + NodeKind::Math(Rc::new(MathNode { formula: formula.into(), display })) + } + + fn Str(string: &str) -> NodeKind { + NodeKind::Str(string.into()) + } + + fn Text(string: &str) -> NodeKind { + NodeKind::Text(string.into()) + } + + fn Ident(ident: &str) -> NodeKind { + NodeKind::Ident(ident.into()) + } + + fn Invalid(invalid: &str) -> NodeKind { + NodeKind::Unknown(invalid.into()) } /// Building blocks for suffix testing. @@ -541,40 +609,6 @@ mod tests { /// - '/': symbols const BLOCKS: &str = " a1/"; - /// Suffixes described by four-tuples of: - /// - /// - block the suffix is part of - /// - mode in which the suffix is applicable - /// - the suffix string - /// - the resulting suffix token - const SUFFIXES: &[(char, Option<TokenMode>, &str, Token)] = &[ - // Whitespace suffixes. - (' ', None, " ", Space(0)), - (' ', None, "\n", Space(1)), - (' ', None, "\r", Space(1)), - (' ', None, "\r\n", Space(1)), - // Letter suffixes. - ('a', Some(Markup), "hello", Text("hello")), - ('a', Some(Markup), "💚", Text("💚")), - ('a', Some(Code), "val", Ident("val")), - ('a', Some(Code), "α", Ident("α")), - ('a', Some(Code), "_", Ident("_")), - // Number suffixes. - ('1', Some(Code), "2", Int(2)), - ('1', Some(Code), ".2", Float(0.2)), - // Symbol suffixes. - ('/', None, "[", LeftBracket), - ('/', None, "//", LineComment("")), - ('/', None, "/**/", BlockComment("")), - ('/', Some(Markup), "*", Star), - ('/', Some(Markup), "$ $", Math(" ", false, true)), - ('/', Some(Markup), r"\\", Text(r"\")), - ('/', Some(Markup), "#let", Let), - ('/', Some(Code), "(", LeftParen), - ('/', Some(Code), ":", Colon), - ('/', Some(Code), "+=", PlusEq), - ]; - macro_rules! t { (Both $($tts:tt)*) => { t!(Markup $($tts)*); @@ -584,8 +618,42 @@ mod tests { // Test without suffix. t!(@$mode: $src => $($token),*); + // Suffixes described by four-tuples of: + // + // - block the suffix is part of + // - mode in which the suffix is applicable + // - the suffix string + // - the resulting suffix NodeKind + let suffixes: &[(char, Option<TokenMode>, &str, NodeKind)] = &[ + // Whitespace suffixes. + (' ', None, " ", Space(0)), + (' ', None, "\n", Space(1)), + (' ', None, "\r", Space(1)), + (' ', None, "\r\n", Space(1)), + // Letter suffixes. + ('a', Some(Markup), "hello", Text("hello")), + ('a', Some(Markup), "💚", Text("💚")), + ('a', Some(Code), "val", Ident("val")), + ('a', Some(Code), "α", Ident("α")), + ('a', Some(Code), "_", Ident("_")), + // Number suffixes. + ('1', Some(Code), "2", Int(2)), + ('1', Some(Code), ".2", Float(0.2)), + // Symbol suffixes. + ('/', None, "[", LeftBracket), + ('/', None, "//", LineComment), + ('/', None, "/**/", BlockComment), + ('/', Some(Markup), "*", Strong), + ('/', Some(Markup), "$ $", Math(" ", false)), + ('/', Some(Markup), r"\\", Text("\\")), + ('/', Some(Markup), "#let", Let), + ('/', Some(Code), "(", LeftParen), + ('/', Some(Code), ":", Colon), + ('/', Some(Code), "+=", PlusEq), + ]; + // Test with each applicable suffix. - for &(block, mode, suffix, token) in SUFFIXES { + for &(block, mode, suffix, ref token) in suffixes { let src = $src; #[allow(unused_variables)] let blocks = BLOCKS; @@ -599,7 +667,7 @@ mod tests { (@$mode:ident: $src:expr => $($token:expr),*) => {{ let src = $src; let found = Tokens::new(&src, $mode).collect::<Vec<_>>(); - let expected = vec![$($token),*]; + let expected = vec![$($token.clone()),*]; check(&src, found, expected); }}; } @@ -671,7 +739,7 @@ mod tests { // Test text ends. t!(Markup[""]: "hello " => Text("hello"), Space(0)); - t!(Markup[""]: "hello~" => Text("hello"), Tilde); + t!(Markup[""]: "hello~" => Text("hello"), NonBreakingSpace); } #[test] @@ -698,31 +766,31 @@ mod tests { t!(Markup[" /"]: r#"\""# => Text(r"\"), Text("\"")); // Test basic unicode escapes. - t!(Markup: r"\u{}" => UnicodeEscape("", true)); - t!(Markup: r"\u{2603}" => UnicodeEscape("2603", true)); - t!(Markup: r"\u{P}" => UnicodeEscape("P", true)); + t!(Markup: r"\u{}" => Error(Full, "invalid unicode escape sequence")); + t!(Markup: r"\u{2603}" => UnicodeEscape('☃')); + t!(Markup: r"\u{P}" => Error(Full, "invalid unicode escape sequence")); // Test unclosed unicode escapes. - t!(Markup[" /"]: r"\u{" => UnicodeEscape("", false)); - t!(Markup[" /"]: r"\u{1" => UnicodeEscape("1", false)); - t!(Markup[" /"]: r"\u{26A4" => UnicodeEscape("26A4", false)); - t!(Markup[" /"]: r"\u{1Q3P" => UnicodeEscape("1Q3P", false)); - t!(Markup: r"\u{1🏕}" => UnicodeEscape("1", false), Text("🏕"), RightBrace); + t!(Markup[" /"]: r"\u{" => Error(End, "expected closing brace")); + t!(Markup[" /"]: r"\u{1" => Error(End, "expected closing brace")); + t!(Markup[" /"]: r"\u{26A4" => Error(End, "expected closing brace")); + t!(Markup[" /"]: r"\u{1Q3P" => Error(End, "expected closing brace")); + t!(Markup: r"\u{1🏕}" => Error(End, "expected closing brace"), Text("🏕"), RightBrace); } #[test] fn test_tokenize_markup_symbols() { // Test markup tokens. - t!(Markup[" a1"]: "*" => Star); - t!(Markup: "_" => Underscore); + t!(Markup[" a1"]: "*" => Strong); + t!(Markup: "_" => Emph); t!(Markup[""]: "===" => Eq, Eq, Eq); t!(Markup["a1/"]: "= " => Eq, Space(0)); - t!(Markup: "~" => Tilde); - t!(Markup[" "]: r"\" => Backslash); - t!(Markup["a "]: r"a--" => Text("a"), HyphHyph); - t!(Markup["a1/"]: "- " => Hyph, Space(0)); - t!(Markup[" "]: "." => Numbering(None)); - t!(Markup[" "]: "1." => Numbering(Some(1))); + t!(Markup: "~" => NonBreakingSpace); + t!(Markup[" "]: r"\" => Linebreak); + t!(Markup["a "]: r"a--" => Text("a"), EnDash); + t!(Markup["a1/"]: "- " => Minus, Space(0)); + t!(Markup[" "]: "." => EnumNumbering(None)); + t!(Markup[" "]: "1." => EnumNumbering(Some(1))); t!(Markup[" "]: "1.a" => Text("1."), Text("a")); t!(Markup[" /"]: "a1." => Text("a1.")); } @@ -734,7 +802,7 @@ mod tests { t!(Code: ";" => Semicolon); t!(Code: ":" => Colon); t!(Code: "+" => Plus); - t!(Code: "-" => Hyph); + t!(Code: "-" => Minus); t!(Code[" a1"]: "*" => Star); t!(Code[" a1"]: "/" => Slash); t!(Code: "=" => Eq); @@ -756,10 +824,10 @@ mod tests { t!(Code[" a/"]: "..." => Dots, Invalid(".")); // Test hyphen as symbol vs part of identifier. - t!(Code[" /"]: "-1" => Hyph, Int(1)); - t!(Code[" /"]: "-a" => Hyph, Ident("a")); - t!(Code[" /"]: "--1" => Hyph, Hyph, Int(1)); - t!(Code[" /"]: "--_a" => Hyph, Hyph, Ident("_a")); + t!(Code[" /"]: "-1" => Minus, Int(1)); + t!(Code[" /"]: "-a" => Minus, Ident("a")); + t!(Code[" /"]: "--1" => Minus, Minus, Int(1)); + t!(Code[" /"]: "--_a" => Minus, Minus, Ident("_a")); t!(Code[" /"]: "a-b" => Ident("a-b")); } @@ -776,13 +844,13 @@ mod tests { ("import", Import), ]; - for &(s, t) in &list { + for (s, t) in list.clone() { t!(Markup[" "]: format!("#{}", s) => t); t!(Markup[" "]: format!("#{0}#{0}", s) => t, t); - t!(Markup[" /"]: format!("# {}", s) => Token::Text("#"), Space(0), Text(s)); + t!(Markup[" /"]: format!("# {}", s) => Text("#"), Space(0), Text(s)); } - for &(s, t) in &list { + for (s, t) in list { t!(Code[" "]: s => t); t!(Markup[" /"]: s => Text(s)); } @@ -796,45 +864,43 @@ mod tests { #[test] fn test_tokenize_raw_blocks() { - let empty = Raw("", 1, true); - // Test basic raw block. - t!(Markup: "``" => empty); - t!(Markup: "`raw`" => Raw("raw", 1, true)); - t!(Markup[""]: "`]" => Raw("]", 1, false)); + t!(Markup: "``" => Raw("", None, false)); + t!(Markup: "`raw`" => Raw("raw", None, false)); + t!(Markup[""]: "`]" => Error(End, "expected 1 backtick")); // Test special symbols in raw block. - t!(Markup: "`[brackets]`" => Raw("[brackets]", 1, true)); - t!(Markup[""]: r"`\`` " => Raw(r"\", 1, true), Raw(" ", 1, false)); + t!(Markup: "`[brackets]`" => Raw("[brackets]", None, false)); + t!(Markup[""]: r"`\`` " => Raw(r"\", None, false), Error(End, "expected 1 backtick")); // Test separated closing backticks. - t!(Markup: "```not `y`e`t```" => Raw("not `y`e`t", 3, true)); + t!(Markup: "```not `y`e`t```" => Raw("`y`e`t", Some("not"), false)); // Test more backticks. - t!(Markup: "``nope``" => empty, Text("nope"), empty); - t!(Markup: "````🚀````" => Raw("🚀", 4, true)); - t!(Markup[""]: "`````👩🚀````noend" => Raw("👩🚀````noend", 5, false)); - t!(Markup[""]: "````raw``````" => Raw("raw", 4, true), empty); + t!(Markup: "``nope``" => Raw("", None, false), Text("nope"), Raw("", None, false)); + t!(Markup: "````🚀````" => Raw("", None, false)); + t!(Markup[""]: "`````👩🚀````noend" => Error(End, "expected 5 backticks")); + t!(Markup[""]: "````raw``````" => Raw("", Some("raw"), false), Raw("", None, false)); } #[test] fn test_tokenize_math_formulas() { // Test basic formula. - t!(Markup: "$$" => Math("", false, true)); - t!(Markup: "$x$" => Math("x", false, true)); - t!(Markup: r"$\\$" => Math(r"\\", false, true)); - t!(Markup: "$[x + y]$" => Math("x + y", true, true)); - t!(Markup: r"$[\\]$" => Math(r"\\", true, true)); + t!(Markup: "$$" => Math("", false)); + t!(Markup: "$x$" => Math("x", false)); + t!(Markup: r"$\\$" => Math(r"\\", false)); + t!(Markup: "$[x + y]$" => Math("x + y", true)); + t!(Markup: r"$[\\]$" => Math(r"\\", true)); // Test unterminated. - t!(Markup[""]: "$x" => Math("x", false, false)); - t!(Markup[""]: "$[x" => Math("x", true, false)); - t!(Markup[""]: "$[x]\n$" => Math("x]\n$", true, false)); + t!(Markup[""]: "$x" => Error(End, "expected closing dollar sign")); + t!(Markup[""]: "$[x" => Error(End, "expected closing bracket and dollar sign")); + t!(Markup[""]: "$[x]\n$" => Error(End, "expected closing bracket and dollar sign")); // Test escape sequences. - t!(Markup: r"$\$x$" => Math(r"\$x", false, true)); - t!(Markup: r"$[\\\]$]$" => Math(r"\\\]$", true, true)); - t!(Markup[""]: r"$[ ]\\$" => Math(r" ]\\$", true, false)); + t!(Markup: r"$\$x$" => Math(r"\$x", false)); + t!(Markup: r"$[\\\]$]$" => Math(r"\\\]$", true)); + t!(Markup[""]: r"$[ ]\\$" => Error(End, "expected closing bracket and dollar sign")); } #[test] @@ -896,8 +962,8 @@ mod tests { let nums = ints.iter().map(|&(k, v)| (k, v as f64)).chain(floats); let suffixes = [ - ("%", Percent as fn(f64) -> Token<'static>), - ("fr", Fraction as fn(f64) -> Token<'static>), + ("%", Percentage as fn(f64) -> NodeKind), + ("fr", Fraction as fn(f64) -> NodeKind), ("mm", |x| Length(x, LengthUnit::Mm)), ("pt", |x| Length(x, LengthUnit::Pt)), ("cm", |x| Length(x, LengthUnit::Cm)), @@ -922,62 +988,62 @@ mod tests { #[test] fn test_tokenize_strings() { // Test basic strings. - t!(Code: "\"hi\"" => Str("hi", true)); - t!(Code: "\"hi\nthere\"" => Str("hi\nthere", true)); - t!(Code: "\"🌎\"" => Str("🌎", true)); + t!(Code: "\"hi\"" => Str("hi")); + t!(Code: "\"hi\nthere\"" => Str("hi\nthere")); + t!(Code: "\"🌎\"" => Str("🌎")); // Test unterminated. - t!(Code[""]: "\"hi" => Str("hi", false)); + t!(Code[""]: "\"hi" => Error(End, "expected quote")); // Test escaped quote. - t!(Code: r#""a\"bc""# => Str(r#"a\"bc"#, true)); - t!(Code[""]: r#""\""# => Str(r#"\""#, false)); + t!(Code: r#""a\"bc""# => Str("a\"bc")); + t!(Code[""]: r#""\""# => Error(End, "expected quote")); } #[test] fn test_tokenize_line_comments() { // Test line comment with no trailing newline. - t!(Both[""]: "//" => LineComment("")); + t!(Both[""]: "//" => LineComment); // Test line comment ends at newline. - t!(Both["a1/"]: "//bc\n" => LineComment("bc"), Space(1)); - t!(Both["a1/"]: "// bc \n" => LineComment(" bc "), Space(1)); - t!(Both["a1/"]: "//bc\r\n" => LineComment("bc"), Space(1)); + t!(Both["a1/"]: "//bc\n" => LineComment, Space(1)); + t!(Both["a1/"]: "// bc \n" => LineComment, Space(1)); + t!(Both["a1/"]: "//bc\r\n" => LineComment, Space(1)); // Test nested line comments. - t!(Both["a1/"]: "//a//b\n" => LineComment("a//b"), Space(1)); + t!(Both["a1/"]: "//a//b\n" => LineComment, Space(1)); } #[test] fn test_tokenize_block_comments() { // Test basic block comments. - t!(Both[""]: "/*" => BlockComment("")); - t!(Both: "/**/" => BlockComment("")); - t!(Both: "/*🏞*/" => BlockComment("🏞")); - t!(Both: "/*\n*/" => BlockComment("\n")); + t!(Both[""]: "/*" => BlockComment); + t!(Both: "/**/" => BlockComment); + t!(Both: "/*🏞*/" => BlockComment); + t!(Both: "/*\n*/" => BlockComment); // Test depth 1 and 2 nested block comments. - t!(Both: "/* /* */ */" => BlockComment(" /* */ ")); - t!(Both: "/*/*/**/*/*/" => BlockComment("/*/**/*/")); + t!(Both: "/* /* */ */" => BlockComment); + t!(Both: "/*/*/**/*/*/" => BlockComment); // Test two nested, one unclosed block comments. - t!(Both[""]: "/*/*/**/*/" => BlockComment("/*/**/*/")); + t!(Both[""]: "/*/*/**/*/" => BlockComment); // Test all combinations of up to two following slashes and stars. - t!(Both[""]: "/*" => BlockComment("")); - t!(Both[""]: "/*/" => BlockComment("/")); - t!(Both[""]: "/**" => BlockComment("*")); - t!(Both[""]: "/*//" => BlockComment("//")); - t!(Both[""]: "/*/*" => BlockComment("/*")); - t!(Both[""]: "/**/" => BlockComment("")); - t!(Both[""]: "/***" => BlockComment("**")); + t!(Both[""]: "/*" => BlockComment); + t!(Both[""]: "/*/" => BlockComment); + t!(Both[""]: "/**" => BlockComment); + t!(Both[""]: "/*//" => BlockComment); + t!(Both[""]: "/*/*" => BlockComment); + t!(Both[""]: "/**/" => BlockComment); + t!(Both[""]: "/***" => BlockComment); } #[test] fn test_tokenize_invalid() { // Test invalidly closed block comments. - t!(Both: "*/" => Token::Invalid("*/")); - t!(Both: "/**/*/" => BlockComment(""), Token::Invalid("*/")); + t!(Both: "*/" => Invalid("*/")); + t!(Both: "/**/*/" => BlockComment, Invalid("*/")); // Test invalid expressions. t!(Code: r"\" => Invalid(r"\")); @@ -990,6 +1056,6 @@ mod tests { // Test invalid number suffixes. t!(Code[" /"]: "1foo" => Invalid("1foo")); t!(Code: "1p%" => Invalid("1p"), Invalid("%")); - t!(Code: "1%%" => Percent(1.0), Invalid("%")); + t!(Code: "1%%" => Percentage(1.0), Invalid("%")); } } |
