path: root/crates/typst-syntax/src
author    Laurenz <laurmaedje@gmail.com>    2024-11-04 10:17:49 +0100
committer GitHub <noreply@github.com>       2024-11-04 10:17:49 +0100
commit    cb1aad3a0cc862c5ff57a557e196ba49a02917de (patch)
tree      80cd62cbeb0f8d2bb999cc984d213f42293ebd24 /crates/typst-syntax/src
parent    6b636167ef2e84c761777261ce1ca3087a75f765 (diff)
parent    2c9728f53b318a6cae092f30ad0956a536af7ccb (diff)
Refactor Parser (#5310)
Diffstat (limited to 'crates/typst-syntax/src')
-rw-r--r--  crates/typst-syntax/src/lexer.rs    |  190
-rw-r--r--  crates/typst-syntax/src/parser.rs   | 1287
-rw-r--r--  crates/typst-syntax/src/reparser.rs |    8
-rw-r--r--  crates/typst-syntax/src/set.rs      |    3
4 files changed, 840 insertions, 648 deletions
diff --git a/crates/typst-syntax/src/lexer.rs b/crates/typst-syntax/src/lexer.rs
index 721225c6..1314016f 100644
--- a/crates/typst-syntax/src/lexer.rs
+++ b/crates/typst-syntax/src/lexer.rs
@@ -4,20 +4,18 @@ use unicode_script::{Script, UnicodeScript};
use unicode_segmentation::UnicodeSegmentation;
use unscanny::Scanner;
-use crate::{SyntaxError, SyntaxKind};
+use crate::{SyntaxError, SyntaxKind, SyntaxNode};
-/// Splits up a string of source code into tokens.
+/// An iterator over a source code string that yields tokens.
#[derive(Clone)]
pub(super) struct Lexer<'s> {
- /// The underlying scanner.
+ /// The scanner: contains the underlying string and location as a "cursor".
s: Scanner<'s>,
/// The mode the lexer is in. This determines which kinds of tokens it
/// produces.
mode: LexMode,
/// Whether the last token contained a newline.
newline: bool,
- /// The state held by raw line lexing.
- raw: Vec<(SyntaxKind, usize)>,
/// An error for the last token.
error: Option<SyntaxError>,
}
@@ -31,8 +29,6 @@ pub(super) enum LexMode {
Math,
/// Keywords, literals and operators.
Code,
- /// The contents of a raw block.
- Raw,
}
impl<'s> Lexer<'s> {
@@ -44,7 +40,6 @@ impl<'s> Lexer<'s> {
mode,
newline: false,
error: None,
- raw: Vec::new(),
}
}
@@ -74,9 +69,11 @@ impl<'s> Lexer<'s> {
self.newline
}
- /// Take out the last error, if any.
- pub fn take_error(&mut self) -> Option<SyntaxError> {
- self.error.take()
+ /// The number of characters between the given index and the most recent newline.
+ pub fn column(&self, index: usize) -> usize {
+ let mut s = self.s; // Make a new temporary scanner (cheap).
+ s.jump(index);
+ s.before().chars().rev().take_while(|&c| !is_newline(c)).count()
}
}
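
The new `column` helper amounts to this standalone computation; a minimal sketch in plain Rust, assuming typst's newline set (the real code runs through the `unscanny::Scanner` rather than slicing):

/// Count the characters between `index` and the most recent newline,
/// mirroring the lexer's new `column` helper (illustrative sketch only).
fn column(text: &str, index: usize) -> usize {
    text[..index].chars().rev().take_while(|&c| !is_newline(c)).count()
}

/// A stand-in for typst's `is_newline` (assumed to match the real set).
fn is_newline(c: char) -> bool {
    matches!(c, '\n' | '\x0B' | '\x0C' | '\r' | '\u{0085}' | '\u{2028}' | '\u{2029}')
}

fn main() {
    let text = "- list\n  item";
    assert_eq!(column(text, 9), 2); // "item" starts two characters into its line.
}
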
@@ -97,21 +94,14 @@ impl Lexer<'_> {
/// Shared methods with all [`LexMode`].
impl Lexer<'_> {
- /// Proceed to the next token and return its [`SyntaxKind`]. Note the
- /// token could be a [trivia](SyntaxKind::is_trivia).
- pub fn next(&mut self) -> SyntaxKind {
- if self.mode == LexMode::Raw {
- let Some((kind, end)) = self.raw.pop() else {
- return SyntaxKind::End;
- };
- self.s.jump(end);
- return kind;
- }
+ /// Return the next token in our text. Returns both the [`SyntaxNode`]
+ /// and the raw [`SyntaxKind`], making it more ergonomic to check the kind.
+ pub fn next(&mut self) -> (SyntaxKind, SyntaxNode) {
+ debug_assert!(self.error.is_none());
+ let start = self.s.cursor();
self.newline = false;
- self.error = None;
- let start = self.s.cursor();
- match self.s.eat() {
+ let kind = match self.s.eat() {
Some(c) if is_space(c, self.mode) => self.whitespace(start, c),
Some('/') if self.s.eat_if('/') => self.line_comment(),
Some('/') if self.s.eat_if('*') => self.block_comment(),
@@ -123,22 +113,32 @@ impl Lexer<'_> {
);
kind
}
-
+ Some('`') if self.mode != LexMode::Math => return self.raw(),
Some(c) => match self.mode {
LexMode::Markup => self.markup(start, c),
- LexMode::Math => self.math(start, c),
+ LexMode::Math => match self.math(start, c) {
+ (kind, None) => kind,
+ (kind, Some(node)) => return (kind, node),
+ },
LexMode::Code => self.code(start, c),
- LexMode::Raw => unreachable!(),
},
None => SyntaxKind::End,
- }
+ };
+
+ let text = self.s.from(start);
+ let node = match self.error.take() {
+ Some(error) => SyntaxNode::error(error, text),
+ None => SyntaxNode::leaf(kind, text),
+ };
+ (kind, node)
}
/// Eat whitespace characters greedily.
fn whitespace(&mut self, start: usize, c: char) -> SyntaxKind {
let more = self.s.eat_while(|c| is_space(c, self.mode));
let newlines = match c {
+ // Optimize eating a single space.
' ' if more.is_empty() => 0,
_ => count_newlines(self.s.from(start)),
};
@@ -187,7 +187,6 @@ impl Lexer<'_> {
fn markup(&mut self, start: usize, c: char) -> SyntaxKind {
match c {
'\\' => self.backslash(),
- '`' => self.raw(),
'h' if self.s.eat_if("ttp://") => self.link(),
'h' if self.s.eat_if("ttps://") => self.link(),
'<' if self.s.at(is_id_continue) => self.label(),
@@ -252,9 +251,10 @@ impl Lexer<'_> {
}
}
- fn raw(&mut self) -> SyntaxKind {
+ /// Lex an entire raw segment at once. This is a convenience to avoid going
+ /// to and from the parser for each raw section.
+ fn raw(&mut self) -> (SyntaxKind, SyntaxNode) {
let start = self.s.cursor() - 1;
- self.raw.clear();
// Determine number of opening backticks.
let mut backticks = 1;
@@ -264,9 +264,11 @@ impl Lexer<'_> {
// Special case for ``.
if backticks == 2 {
- self.push_raw(SyntaxKind::RawDelim);
- self.s.jump(start + 1);
- return SyntaxKind::RawDelim;
+ let nodes = vec![
+ SyntaxNode::leaf(SyntaxKind::RawDelim, "`"),
+ SyntaxNode::leaf(SyntaxKind::RawDelim, "`"),
+ ];
+ return (SyntaxKind::Raw, SyntaxNode::inner(SyntaxKind::Raw, nodes));
}
// Find end of raw text.
@@ -275,43 +277,55 @@ impl Lexer<'_> {
match self.s.eat() {
Some('`') => found += 1,
Some(_) => found = 0,
- None => break,
+ None => {
+ let msg = SyntaxError::new("unclosed raw text");
+ let error = SyntaxNode::error(msg, self.s.from(start));
+ return (SyntaxKind::Error, error);
+ }
}
}
+ let end = self.s.cursor();
- if found != backticks {
- return self.error("unclosed raw text");
- }
+ let mut nodes = Vec::with_capacity(3); // Will have at least 3.
+
+ // A closure for pushing a node onto our raw vector. Assumes the caller
+ // will move the scanner to the next location at each step.
+ let mut prev_start = start;
+ let mut push_raw = |kind, s: &Scanner| {
+ nodes.push(SyntaxNode::leaf(kind, s.from(prev_start)));
+ prev_start = s.cursor();
+ };
+
+ // Opening delimiter.
+ self.s.jump(start + backticks);
+ push_raw(SyntaxKind::RawDelim, &self.s);
- let end = self.s.cursor();
if backticks >= 3 {
- self.blocky_raw(start, end, backticks);
+ self.blocky_raw(end - backticks, &mut push_raw);
} else {
- self.inline_raw(start, end, backticks);
+ self.inline_raw(end - backticks, &mut push_raw);
}
// Closing delimiter.
- self.push_raw(SyntaxKind::RawDelim);
-
- // The saved tokens will be removed in reverse.
- self.raw.reverse();
+ self.s.jump(end);
+ push_raw(SyntaxKind::RawDelim, &self.s);
- // Opening delimiter.
- self.s.jump(start + backticks);
- SyntaxKind::RawDelim
+ (SyntaxKind::Raw, SyntaxNode::inner(SyntaxKind::Raw, nodes))
}
- fn blocky_raw(&mut self, start: usize, end: usize, backticks: usize) {
+ fn blocky_raw<F>(&mut self, inner_end: usize, mut push_raw: F)
+ where
+ F: FnMut(SyntaxKind, &Scanner),
+ {
// Language tag.
- self.s.jump(start + backticks);
if self.s.eat_if(is_id_start) {
self.s.eat_while(is_id_continue);
- self.push_raw(SyntaxKind::RawLang);
+ push_raw(SyntaxKind::RawLang, &self.s);
}
// Determine inner content between backticks.
self.s.eat_if(' ');
- let inner = self.s.to(end - backticks);
+ let inner = self.s.to(inner_end);
// Determine dedent level.
let mut lines = split_newlines(inner);
@@ -357,41 +371,32 @@ impl Lexer<'_> {
let offset: usize = line.chars().take(dedent).map(char::len_utf8).sum();
self.s.eat_newline();
self.s.advance(offset);
- self.push_raw(SyntaxKind::RawTrimmed);
+ push_raw(SyntaxKind::RawTrimmed, &self.s);
self.s.advance(line.len() - offset);
- self.push_raw(SyntaxKind::Text);
+ push_raw(SyntaxKind::Text, &self.s);
}
// Add final trimmed.
- if self.s.cursor() < end - backticks {
- self.s.jump(end - backticks);
- self.push_raw(SyntaxKind::RawTrimmed);
+ if self.s.cursor() < inner_end {
+ self.s.jump(inner_end);
+ push_raw(SyntaxKind::RawTrimmed, &self.s);
}
- self.s.jump(end);
}
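
The dedent behavior preserved here can be sketched standalone: take the minimum leading-whitespace width over the non-blank lines after the first, then trim that prefix from each line. A simplified sketch only; the real `blocky_raw` works over scanner offsets and emits `RawTrimmed` nodes instead of building new strings:

/// Strip the common indentation from the inner lines of a raw block.
fn dedent(lines: &[&str]) -> Vec<String> {
    // The first line sits right after the opening backticks, so it never
    // contributes to (or receives) dedenting.
    let dedent = lines
        .iter()
        .skip(1)
        .filter(|line| !line.chars().all(char::is_whitespace))
        .map(|line| line.chars().take_while(|c| c.is_whitespace()).count())
        .min()
        .unwrap_or(0);
    lines
        .iter()
        .enumerate()
        .map(|(i, line)| {
            if i == 0 {
                line.to_string()
            } else {
                line.chars().skip(dedent).collect()
            }
        })
        .collect()
}

fn main() {
    let lines = ["", "    let x = 1", "      x + 2"];
    assert_eq!(dedent(&lines), ["", "let x = 1", "  x + 2"]);
}
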
- fn inline_raw(&mut self, start: usize, end: usize, backticks: usize) {
- self.s.jump(start + backticks);
-
- while self.s.cursor() < end - backticks {
+ fn inline_raw<F>(&mut self, inner_end: usize, mut push_raw: F)
+ where
+ F: FnMut(SyntaxKind, &Scanner),
+ {
+ while self.s.cursor() < inner_end {
if self.s.at(is_newline) {
- self.push_raw(SyntaxKind::Text);
+ push_raw(SyntaxKind::Text, &self.s);
self.s.eat_newline();
- self.push_raw(SyntaxKind::RawTrimmed);
+ push_raw(SyntaxKind::RawTrimmed, &self.s);
continue;
}
self.s.eat();
}
- self.push_raw(SyntaxKind::Text);
-
- self.s.jump(end);
- }
-
- /// Push the current cursor that marks the end of a raw segment of
- /// the given `kind`.
- fn push_raw(&mut self, kind: SyntaxKind) {
- let end = self.s.cursor();
- self.raw.push((kind, end));
+ push_raw(SyntaxKind::Text, &self.s);
}
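
Taken together, the new single-pass raw lexing splits a snippet into delimiters and inner text in one sweep. A rough standalone sketch of the shape of that split, with a hypothetical `split_raw` helper; the real `raw` builds `SyntaxNode`s via `push_raw` and also handles language tags, trimming, and the empty `` special case:

/// Count opening backticks, find the matching closing run, and return the
/// three raw pieces, or `None` if the raw text is unclosed (sketch only).
fn split_raw(text: &str) -> Option<(&str, &str, &str)> {
    let backticks = text.chars().take_while(|&c| c == '`').count();
    let rest = &text[backticks..];
    let close = rest.find(&"`".repeat(backticks))?;
    Some((&text[..backticks], &rest[..close], &rest[close..close + backticks]))
}

fn main() {
    assert_eq!(split_raw("`code`"), Some(("`", "code", "`")));
    assert_eq!(split_raw("```rs fn()```"), Some(("```", "rs fn()", "```")));
    assert_eq!(split_raw("`oops"), None); // Unclosed raw text becomes an error node.
}
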
fn link(&mut self) -> SyntaxKind {
@@ -512,8 +517,8 @@ impl Lexer<'_> {
/// Math.
impl Lexer<'_> {
- fn math(&mut self, start: usize, c: char) -> SyntaxKind {
- match c {
+ fn math(&mut self, start: usize, c: char) -> (SyntaxKind, Option<SyntaxNode>) {
+ let kind = match c {
'\\' => self.backslash(),
'"' => self.string(),
@@ -566,11 +571,41 @@ impl Lexer<'_> {
// Identifiers.
c if is_math_id_start(c) && self.s.at(is_math_id_continue) => {
self.s.eat_while(is_math_id_continue);
- SyntaxKind::MathIdent
+ let (kind, node) = self.math_ident_or_field(start);
+ return (kind, Some(node));
}
// Other math atoms.
_ => self.math_text(start, c),
+ };
+ (kind, None)
+ }
+
+ /// Parse a single `MathIdent` or an entire `FieldAccess`.
+ fn math_ident_or_field(&mut self, start: usize) -> (SyntaxKind, SyntaxNode) {
+ let mut kind = SyntaxKind::MathIdent;
+ let mut node = SyntaxNode::leaf(kind, self.s.from(start));
+ while let Some(ident) = self.maybe_dot_ident() {
+ kind = SyntaxKind::FieldAccess;
+ let field_children = vec![
+ node,
+ SyntaxNode::leaf(SyntaxKind::Dot, '.'),
+ SyntaxNode::leaf(SyntaxKind::Ident, ident),
+ ];
+ node = SyntaxNode::inner(kind, field_children);
+ }
+ (kind, node)
+ }
+
+ /// If at a dot followed by a math identifier, eat both and return the identifier.
+ fn maybe_dot_ident(&mut self) -> Option<&str> {
+ if self.s.scout(1).is_some_and(is_math_id_start) && self.s.eat_if('.') {
+ let ident_start = self.s.cursor();
+ self.s.eat();
+ self.s.eat_while(is_math_id_continue);
+ Some(self.s.from(ident_start))
+ } else {
+ None
}
}
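
The loop in `math_ident_or_field` folds a dotted path left-associatively: each `.ident` wraps the node built so far. A toy sketch with a hypothetical `Node` type standing in for `SyntaxNode`:

#[derive(Debug, PartialEq)]
enum Node {
    Ident(String),
    FieldAccess(Box<Node>, String),
}

/// Fold `a.b.c` into left-nested field accesses, mirroring the lexer's loop.
fn fold_fields(path: &str) -> Node {
    let mut parts = path.split('.');
    let mut node = Node::Ident(parts.next().unwrap().to_string());
    for field in parts {
        node = Node::FieldAccess(Box::new(node), field.to_string());
    }
    node
}

fn main() {
    // `arrow.r.squiggly` in math becomes `((arrow).r).squiggly`.
    let expected = Node::FieldAccess(
        Box::new(Node::FieldAccess(Box::new(Node::Ident("arrow".into())), "r".into())),
        "squiggly".into(),
    );
    assert_eq!(fold_fields("arrow.r.squiggly"), expected);
}
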
@@ -599,7 +634,6 @@ impl Lexer<'_> {
impl Lexer<'_> {
fn code(&mut self, start: usize, c: char) -> SyntaxKind {
match c {
- '`' => self.raw(),
'<' if self.s.at(is_id_continue) => self.label(),
'0'..='9' => self.number(start, c),
'.' if self.s.at(char::is_ascii_digit) => self.number(start, c),
diff --git a/crates/typst-syntax/src/parser.rs b/crates/typst-syntax/src/parser.rs
index a8bec626..ea5b9155 100644
--- a/crates/typst-syntax/src/parser.rs
+++ b/crates/typst-syntax/src/parser.rs
@@ -6,66 +6,57 @@ use ecow::{eco_format, EcoString};
use unicode_math_class::MathClass;
use crate::set::{syntax_set, SyntaxSet};
-use crate::{
- ast, is_ident, is_newline, set, LexMode, Lexer, SyntaxError, SyntaxKind, SyntaxNode,
-};
+use crate::{ast, set, LexMode, Lexer, SyntaxError, SyntaxKind, SyntaxNode};
-/// Parses a source file.
+/// Parses a source file as top-level markup.
pub fn parse(text: &str) -> SyntaxNode {
let _scope = typst_timing::TimingScope::new("parse");
let mut p = Parser::new(text, 0, LexMode::Markup);
- markup(&mut p, true, 0, |_| false);
- p.finish().into_iter().next().unwrap()
+ markup_exprs(&mut p, true, syntax_set!(End));
+ p.finish_into(SyntaxKind::Markup)
}
/// Parses top-level code.
pub fn parse_code(text: &str) -> SyntaxNode {
let _scope = typst_timing::TimingScope::new("parse code");
let mut p = Parser::new(text, 0, LexMode::Code);
- let m = p.marker();
- p.skip();
- code_exprs(&mut p, |_| false);
- p.wrap_all(m, SyntaxKind::Code);
- p.finish().into_iter().next().unwrap()
+ code_exprs(&mut p, syntax_set!(End));
+ p.finish_into(SyntaxKind::Code)
}
/// Parses top-level math.
pub fn parse_math(text: &str) -> SyntaxNode {
let _scope = typst_timing::TimingScope::new("parse math");
let mut p = Parser::new(text, 0, LexMode::Math);
- math(&mut p, |_| false);
- p.finish().into_iter().next().unwrap()
+ math_exprs(&mut p, syntax_set!(End));
+ p.finish_into(SyntaxKind::Math)
}
-/// Parses the contents of a file or content block.
-fn markup(
- p: &mut Parser,
- mut at_start: bool,
- min_indent: usize,
- mut stop: impl FnMut(&Parser) -> bool,
-) {
- let m = p.marker();
+/// Parses markup expressions until a stop condition is met.
+fn markup(p: &mut Parser, at_start: bool, wrap_trivia: bool, stop_set: SyntaxSet) {
+ let m = if wrap_trivia { p.before_trivia() } else { p.marker() };
+ markup_exprs(p, at_start, stop_set);
+ if wrap_trivia {
+ p.flush_trivia();
+ }
+ p.wrap(m, SyntaxKind::Markup);
+}
+
+/// Parses a sequence of markup expressions.
+fn markup_exprs(p: &mut Parser, mut at_start: bool, stop_set: SyntaxSet) {
+ debug_assert!(stop_set.contains(SyntaxKind::End));
+ at_start |= p.had_newline();
let mut nesting: usize = 0;
- while !p.end() {
+ loop {
match p.current() {
SyntaxKind::LeftBracket => nesting += 1,
SyntaxKind::RightBracket if nesting > 0 => nesting -= 1,
- _ if stop(p) => break,
+ _ if p.at_set(stop_set) => break,
_ => {}
}
-
- if p.newline() {
- at_start = true;
- if min_indent > 0 && p.column(p.current_end()) < min_indent {
- break;
- }
- p.eat();
- continue;
- }
-
- markup_expr(p, &mut at_start);
+ markup_expr(p, at_start);
+ at_start = p.had_newline();
}
- p.wrap(m, SyntaxKind::Markup);
}
/// Reparses a subsection of markup incrementally.
@@ -74,40 +65,29 @@ pub(super) fn reparse_markup(
range: Range<usize>,
at_start: &mut bool,
nesting: &mut usize,
- mut stop: impl FnMut(SyntaxKind) -> bool,
+ top_level: bool,
) -> Option<Vec<SyntaxNode>> {
let mut p = Parser::new(text, range.start, LexMode::Markup);
- while !p.end() && p.current_start() < range.end {
+ *at_start |= p.had_newline();
+ while p.current_start() < range.end {
match p.current() {
SyntaxKind::LeftBracket => *nesting += 1,
SyntaxKind::RightBracket if *nesting > 0 => *nesting -= 1,
- _ if stop(p.current()) => break,
+ SyntaxKind::RightBracket if !top_level => break,
+ SyntaxKind::End => break,
_ => {}
}
-
- if p.newline() {
- *at_start = true;
- p.eat();
- continue;
- }
-
- markup_expr(&mut p, at_start);
+ markup_expr(&mut p, *at_start);
+ *at_start = p.had_newline();
}
(p.balanced && p.current_start() == range.end).then(|| p.finish())
}
-/// Parses a single markup expression: This includes markup elements like
-/// spaces, text, and headings, and embedded code expressions.
-fn markup_expr(p: &mut Parser, at_start: &mut bool) {
+/// Parses a single markup expression. This includes markup elements like text,
+/// headings, strong/emph, lists/enums, etc. This is also the entry point for
+/// parsing math equations and embedded code expressions.
+fn markup_expr(p: &mut Parser, at_start: bool) {
match p.current() {
- SyntaxKind::Space
- | SyntaxKind::Parbreak
- | SyntaxKind::LineComment
- | SyntaxKind::BlockComment => {
- p.eat();
- return;
- }
-
SyntaxKind::Text
| SyntaxKind::Linebreak
| SyntaxKind::Escape
@@ -116,14 +96,15 @@ fn markup_expr(p: &mut Parser, at_start: &mut bool) {
| SyntaxKind::Link
| SyntaxKind::Label => p.eat(),
+ SyntaxKind::Raw => p.eat(), // Raw is handled entirely in the Lexer.
+
SyntaxKind::Hash => embedded_code_expr(p),
SyntaxKind::Star => strong(p),
SyntaxKind::Underscore => emph(p),
- SyntaxKind::RawDelim => raw(p),
- SyntaxKind::HeadingMarker if *at_start => heading(p),
- SyntaxKind::ListMarker if *at_start => list_item(p),
- SyntaxKind::EnumMarker if *at_start => enum_item(p),
- SyntaxKind::TermMarker if *at_start => term_item(p),
+ SyntaxKind::HeadingMarker if at_start => heading(p),
+ SyntaxKind::ListMarker if at_start => list_item(p),
+ SyntaxKind::EnumMarker if at_start => enum_item(p),
+ SyntaxKind::TermMarker if at_start => term_item(p),
SyntaxKind::RefMarker => reference(p),
SyntaxKind::Dollar => equation(p),
@@ -133,94 +114,76 @@ fn markup_expr(p: &mut Parser, at_start: &mut bool) {
| SyntaxKind::ListMarker
| SyntaxKind::EnumMarker
| SyntaxKind::TermMarker
- | SyntaxKind::Colon => p.convert(SyntaxKind::Text),
+ | SyntaxKind::Colon => p.convert_and_eat(SyntaxKind::Text),
- _ => {
- p.unexpected();
- return; // Don't set `at_start`
- }
+ _ => p.unexpected(),
}
-
- *at_start = false;
}
/// Parses strong content: `*Strong*`.
fn strong(p: &mut Parser) {
- let m = p.marker();
- p.assert(SyntaxKind::Star);
- markup(p, false, 0, |p| p.at_set(syntax_set!(Star, Parbreak, RightBracket)));
- p.expect_closing_delimiter(m, SyntaxKind::Star);
- p.wrap(m, SyntaxKind::Strong);
+ p.with_nl_mode(AtNewline::StopParBreak, |p| {
+ let m = p.marker();
+ p.assert(SyntaxKind::Star);
+ markup(p, false, true, syntax_set!(Star, RightBracket, End));
+ p.expect_closing_delimiter(m, SyntaxKind::Star);
+ p.wrap(m, SyntaxKind::Strong);
+ });
}
/// Parses emphasized content: `_Emphasized_`.
fn emph(p: &mut Parser) {
- let m = p.marker();
- p.assert(SyntaxKind::Underscore);
- markup(p, false, 0, |p| p.at_set(syntax_set!(Underscore, Parbreak, RightBracket)));
- p.expect_closing_delimiter(m, SyntaxKind::Underscore);
- p.wrap(m, SyntaxKind::Emph);
-}
-
-/// Parses raw text with optional syntax highlighting: `` `...` ``.
-fn raw(p: &mut Parser) {
- let m = p.marker();
- p.enter(LexMode::Raw);
- p.assert(SyntaxKind::RawDelim);
-
- // Eats until the closing delimiter.
- while !p.end() && !p.at(SyntaxKind::RawDelim) {
- p.eat();
- }
-
- p.expect(SyntaxKind::RawDelim);
- p.exit();
- p.wrap(m, SyntaxKind::Raw);
+ p.with_nl_mode(AtNewline::StopParBreak, |p| {
+ let m = p.marker();
+ p.assert(SyntaxKind::Underscore);
+ markup(p, false, true, syntax_set!(Underscore, RightBracket, End));
+ p.expect_closing_delimiter(m, SyntaxKind::Underscore);
+ p.wrap(m, SyntaxKind::Emph);
+ });
}
/// Parses a section heading: `= Introduction`.
fn heading(p: &mut Parser) {
- let m = p.marker();
- p.assert(SyntaxKind::HeadingMarker);
- whitespace_line(p);
- markup(p, false, usize::MAX, |p| {
- p.at_set(syntax_set!(Label, Space, RightBracket))
- && (!p.at(SyntaxKind::Space) || p.lexer.clone().next() == SyntaxKind::Label)
+ p.with_nl_mode(AtNewline::Stop, |p| {
+ let m = p.marker();
+ p.assert(SyntaxKind::HeadingMarker);
+ markup(p, false, false, syntax_set!(Label, RightBracket, End));
+ p.wrap(m, SyntaxKind::Heading);
});
- p.wrap(m, SyntaxKind::Heading);
}
/// Parses an item in a bullet list: `- ...`.
fn list_item(p: &mut Parser) {
- let m = p.marker();
- let min_indent = p.column(p.current_start()) + 1;
- p.assert(SyntaxKind::ListMarker);
- whitespace_line(p);
- markup(p, false, min_indent, |p| p.at(SyntaxKind::RightBracket));
- p.wrap(m, SyntaxKind::ListItem);
+ p.with_nl_mode(AtNewline::RequireColumn(p.current_column()), |p| {
+ let m = p.marker();
+ p.assert(SyntaxKind::ListMarker);
+ markup(p, false, false, syntax_set!(RightBracket, End));
+ p.wrap(m, SyntaxKind::ListItem);
+ });
}
/// Parses an item in an enumeration (numbered list): `+ ...` or `1. ...`.
fn enum_item(p: &mut Parser) {
- let m = p.marker();
- let min_indent = p.column(p.current_start()) + 1;
- p.assert(SyntaxKind::EnumMarker);
- whitespace_line(p);
- markup(p, false, min_indent, |p| p.at(SyntaxKind::RightBracket));
- p.wrap(m, SyntaxKind::EnumItem);
+ p.with_nl_mode(AtNewline::RequireColumn(p.current_column()), |p| {
+ let m = p.marker();
+ p.assert(SyntaxKind::EnumMarker);
+ markup(p, false, false, syntax_set!(RightBracket, End));
+ p.wrap(m, SyntaxKind::EnumItem);
+ });
}
/// Parses an item in a term list: `/ Term: Details`.
fn term_item(p: &mut Parser) {
- let m = p.marker();
- p.assert(SyntaxKind::TermMarker);
- let min_indent = p.column(p.prev_end());
- whitespace_line(p);
- markup(p, false, usize::MAX, |p| p.at_set(syntax_set!(Colon, RightBracket)));
- p.expect(SyntaxKind::Colon);
- whitespace_line(p);
- markup(p, false, min_indent, |p| p.at(SyntaxKind::RightBracket));
- p.wrap(m, SyntaxKind::TermItem);
+ p.with_nl_mode(AtNewline::RequireColumn(p.current_column()), |p| {
+ let m = p.marker();
+ p.with_nl_mode(AtNewline::Stop, |p| {
+ p.assert(SyntaxKind::TermMarker);
+ markup(p, false, false, syntax_set!(Colon, RightBracket, End));
+ });
+ p.expect(SyntaxKind::Colon);
+ markup(p, false, false, syntax_set!(RightBracket, End));
+ p.wrap(m, SyntaxKind::TermItem);
+ });
}
/// Parses a reference: `@target`, `@target[..]`.
@@ -233,35 +196,34 @@ fn reference(p: &mut Parser) {
p.wrap(m, SyntaxKind::Ref);
}
-/// Consumes whitespace that does not contain a newline.
-fn whitespace_line(p: &mut Parser) {
- while !p.newline() && p.current().is_trivia() {
- p.eat();
- }
-}
-
/// Parses a mathematical equation: `$x$`, `$ x^2 $`.
fn equation(p: &mut Parser) {
let m = p.marker();
- p.enter(LexMode::Math);
- p.assert(SyntaxKind::Dollar);
- math(p, |p| p.at(SyntaxKind::Dollar));
- p.expect_closing_delimiter(m, SyntaxKind::Dollar);
- p.exit();
+ p.enter_modes(LexMode::Math, AtNewline::Continue, |p| {
+ p.assert(SyntaxKind::Dollar);
+ math(p, syntax_set!(Dollar, RightBracket, End));
+ p.expect_closing_delimiter(m, SyntaxKind::Dollar);
+ });
p.wrap(m, SyntaxKind::Equation);
}
/// Parses the contents of a mathematical equation: `x^2 + 1`.
-fn math(p: &mut Parser, mut stop: impl FnMut(&Parser) -> bool) {
+fn math(p: &mut Parser, stop_set: SyntaxSet) {
let m = p.marker();
- while !p.end() && !stop(p) {
+ math_exprs(p, stop_set);
+ p.wrap(m, SyntaxKind::Math);
+}
+
+/// Parses a sequence of math expressions.
+fn math_exprs(p: &mut Parser, stop_set: SyntaxSet) {
+ debug_assert!(stop_set.contains(SyntaxKind::End));
+ while !p.at_set(stop_set) {
if p.at_set(set::MATH_EXPR) {
math_expr(p);
} else {
p.unexpected();
}
}
- p.wrap(m, SyntaxKind::Math);
}
/// Parses a single math expression: This includes math elements like
@@ -276,21 +238,11 @@ fn math_expr_prec(p: &mut Parser, min_prec: usize, stop: SyntaxKind) {
let mut continuable = false;
match p.current() {
SyntaxKind::Hash => embedded_code_expr(p),
- SyntaxKind::MathIdent => {
+ // The lexer manages creating full FieldAccess nodes if needed.
+ SyntaxKind::MathIdent | SyntaxKind::FieldAccess => {
continuable = true;
p.eat();
- while p.directly_at(SyntaxKind::Text) && p.current_text() == "." && {
- let mut copy = p.lexer.clone();
- let start = copy.cursor();
- let next = copy.next();
- let end = copy.cursor();
- matches!(next, SyntaxKind::MathIdent | SyntaxKind::Text)
- && is_ident(&p.text[start..end])
- } {
- p.convert(SyntaxKind::Dot);
- p.convert(SyntaxKind::Ident);
- p.wrap(m, SyntaxKind::FieldAccess);
- }
+ // Parse a function call for an identifier or field access.
if min_prec < 3 && p.directly_at(SyntaxKind::Text) && p.current_text() == "("
{
math_args(p);
@@ -340,11 +292,7 @@ fn math_expr_prec(p: &mut Parser, min_prec: usize, stop: SyntaxKind) {
_ => p.expected("expression"),
}
- if continuable
- && min_prec < 3
- && p.prev_end() == p.current_start()
- && maybe_delimited(p)
- {
+ if continuable && min_prec < 3 && !p.had_trivia() && maybe_delimited(p) {
p.wrap(m, SyntaxKind::Math);
}
@@ -414,6 +362,23 @@ fn math_expr_prec(p: &mut Parser, min_prec: usize, stop: SyntaxKind) {
}
}
+/// Precedence and wrapper kinds for the binary math operators.
+fn math_op(kind: SyntaxKind) -> Option<(SyntaxKind, SyntaxKind, ast::Assoc, usize)> {
+ match kind {
+ SyntaxKind::Underscore => {
+ Some((SyntaxKind::MathAttach, SyntaxKind::Hat, ast::Assoc::Right, 2))
+ }
+ SyntaxKind::Hat => {
+ Some((SyntaxKind::MathAttach, SyntaxKind::Underscore, ast::Assoc::Right, 2))
+ }
+ SyntaxKind::Slash => {
+ Some((SyntaxKind::MathFrac, SyntaxKind::End, ast::Assoc::Left, 1))
+ }
+ _ => None,
+ }
+}
+
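How `math_expr_prec` consumes these tuples is standard precedence climbing: right-associative operators recurse with the same minimum precedence, left-associative ones with one higher. A self-contained sketch of that technique under hypothetical names; this is not the actual typst code:

#[derive(Debug)]
enum Expr {
    Atom(char),
    Op(char, Box<Expr>, Box<Expr>),
}

fn parse(tokens: &[char], pos: &mut usize, min_prec: usize) -> Expr {
    let mut lhs = Expr::Atom(tokens[*pos]);
    *pos += 1;
    while *pos < tokens.len() {
        let (prec, right_assoc) = match tokens[*pos] {
            '^' | '_' => (2, true), // MathAttach: right-associative.
            '/' => (1, false),      // MathFrac: left-associative.
            _ => break,
        };
        if prec < min_prec {
            break;
        }
        let op = tokens[*pos];
        *pos += 1;
        // Left-associative operators raise the minimum precedence so an
        // equal-precedence operator to the right won't capture our rhs.
        let next_min = if right_assoc { prec } else { prec + 1 };
        let rhs = parse(tokens, pos, next_min);
        lhs = Expr::Op(op, Box::new(lhs), Box::new(rhs));
    }
    lhs
}

fn main() {
    let tokens: Vec<char> = "a^b^c".chars().collect();
    let expr = parse(&tokens, &mut 0, 0);
    // `a^b^c` groups as `a^(b^c)` because `^` is right-associative.
    assert!(matches!(expr, Expr::Op('^', _, ref r) if matches!(**r, Expr::Op('^', ..))));
}
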
+/// Try to parse delimiters based on the current token's unicode math class.
fn maybe_delimited(p: &mut Parser) -> bool {
let open = math_class(p.current_text()) == Some(MathClass::Opening);
if open {
@@ -422,11 +387,12 @@ fn maybe_delimited(p: &mut Parser) -> bool {
open
}
+/// Parse matched delimiters in math: `[x + y]`.
fn math_delimited(p: &mut Parser) {
let m = p.marker();
p.eat();
let m2 = p.marker();
- while !p.end() && !p.at(SyntaxKind::Dollar) {
+ while !p.at_set(syntax_set!(Dollar, End)) {
if math_class(p.current_text()) == Some(MathClass::Closing) {
p.wrap(m2, SyntaxKind::Math);
p.eat();
@@ -444,6 +410,8 @@ fn math_delimited(p: &mut Parser) {
p.wrap(m, SyntaxKind::Math);
}
+/// Remove one set of parentheses (if any) from a previously parsed expression
+/// by converting to non-expression SyntaxKinds.
fn math_unparen(p: &mut Parser, m: Marker) {
let Some(node) = p.nodes.get_mut(m.0) else { return };
if node.kind() != SyntaxKind::MathDelimited {
@@ -460,6 +428,10 @@ fn math_unparen(p: &mut Parser, m: Marker) {
node.convert_to_kind(SyntaxKind::Math);
}
+/// The unicode math class of a string. Returns `Some` only if `text` is
+/// exactly one unicode character or a math shorthand string (currently just
+/// `[|`, `||`, `|]`), and only if a math class is defined for that character.
fn math_class(text: &str) -> Option<MathClass> {
match text {
"[|" => return Some(MathClass::Opening),
@@ -475,38 +447,26 @@ fn math_class(text: &str) -> Option<MathClass> {
.and_then(unicode_math_class::class)
}
-fn math_op(kind: SyntaxKind) -> Option<(SyntaxKind, SyntaxKind, ast::Assoc, usize)> {
- match kind {
- SyntaxKind::Underscore => {
- Some((SyntaxKind::MathAttach, SyntaxKind::Hat, ast::Assoc::Right, 2))
- }
- SyntaxKind::Hat => {
- Some((SyntaxKind::MathAttach, SyntaxKind::Underscore, ast::Assoc::Right, 2))
- }
- SyntaxKind::Slash => {
- Some((SyntaxKind::MathFrac, SyntaxKind::End, ast::Assoc::Left, 1))
- }
- _ => None,
- }
-}
-
+/// Parse an argument list in math: `(a, b; c, d; size: #50%)`.
fn math_args(p: &mut Parser) {
let m = p.marker();
- p.convert(SyntaxKind::LeftParen);
+ p.convert_and_eat(SyntaxKind::LeftParen);
let mut namable = true;
let mut named = None;
let mut has_arrays = false;
let mut array = p.marker();
let mut arg = p.marker();
+ // The number of math expressions per argument.
+ let mut count = 0;
- while !p.end() && !p.at(SyntaxKind::Dollar) {
+ while !p.at_set(syntax_set!(Dollar, End)) {
if namable
&& (p.at(SyntaxKind::MathIdent) || p.at(SyntaxKind::Text))
&& p.text[p.current_end()..].starts_with(':')
{
- p.convert(SyntaxKind::Ident);
- p.convert(SyntaxKind::Colon);
+ p.convert_and_eat(SyntaxKind::Ident);
+ p.convert_and_eat(SyntaxKind::Colon);
named = Some(arg);
arg = p.marker();
array = p.marker();
@@ -515,20 +475,22 @@ fn math_args(p: &mut Parser) {
match p.current_text() {
")" => break,
";" => {
- maybe_wrap_in_math(p, arg, named);
+ maybe_wrap_in_math(p, arg, count, named);
p.wrap(array, SyntaxKind::Array);
- p.convert(SyntaxKind::Semicolon);
+ p.convert_and_eat(SyntaxKind::Semicolon);
array = p.marker();
arg = p.marker();
+ count = 0;
namable = true;
named = None;
has_arrays = true;
continue;
}
"," => {
- maybe_wrap_in_math(p, arg, named);
- p.convert(SyntaxKind::Comma);
+ maybe_wrap_in_math(p, arg, count, named);
+ p.convert_and_eat(SyntaxKind::Comma);
arg = p.marker();
+ count = 0;
namable = true;
if named.is_some() {
array = p.marker();
@@ -541,6 +503,7 @@ fn math_args(p: &mut Parser) {
if p.at_set(set::MATH_EXPR) {
math_expr(p);
+ count += 1;
} else {
p.unexpected();
}
@@ -549,7 +512,7 @@ fn math_args(p: &mut Parser) {
}
if arg != p.marker() {
- maybe_wrap_in_math(p, arg, named);
+ maybe_wrap_in_math(p, arg, count, named);
if named.is_some() {
array = p.marker();
}
@@ -560,7 +523,7 @@ fn math_args(p: &mut Parser) {
}
if p.at(SyntaxKind::Text) && p.current_text() == ")" {
- p.convert(SyntaxKind::RightParen);
+ p.convert_and_eat(SyntaxKind::RightParen);
} else {
p.expected("closing paren");
p.balanced = false;
@@ -569,23 +532,26 @@ fn math_args(p: &mut Parser) {
p.wrap(m, SyntaxKind::Args);
}
-/// Wrap math function arguments in a "Math" SyntaxKind to combine adjacent expressions
-/// or create blank content.
-///
-/// We don't wrap when `exprs == 1`, as there is only one expression, so the grouping
-/// isn't needed, and this would change the type of the expression from potentially
-/// non-content to content.
+/// Wrap math function arguments to join adjacent math content or to create
+/// an empty 'Math' node when there are zero args.
///
-/// Note that `exprs` might be 0 if we have whitespace or trivia before a comma i.e.
-/// `mat(; ,)` or `sin(x, , , ,)`. This would create an empty Math element before that
-/// trivia if we called `p.wrap()` -- breaking the expected AST for 2-d arguments -- so
-/// we instead manually wrap to our current marker using `p.wrap_within()`.
-fn maybe_wrap_in_math(p: &mut Parser, arg: Marker, named: Option<Marker>) {
- let exprs = p.post_process(arg).filter(|node| node.is::<ast::Expr>()).count();
- if exprs != 1 {
- // Convert 0 exprs into a blank math element (so empty arguments are allowed).
- // Convert 2+ exprs into a math element (so they become a joined sequence).
- p.wrap_within(arg, p.marker(), SyntaxKind::Math);
+/// We don't wrap when `count == 1`, since wrapping would change the type of the
+/// expression from potentially non-content to content. For example,
+/// `$ func(#12pt) $` would change the type from size to content if wrapped.
+fn maybe_wrap_in_math(p: &mut Parser, arg: Marker, count: usize, named: Option<Marker>) {
+ if count == 0 {
+ // Flush trivia so that the new empty Math node will be wrapped _inside_
+ // any `SyntaxKind::Array` elements created in `math_args`.
+ // (And if we don't follow by wrapping in an array, it has no effect.)
+ // The difference in node layout without this would look like:
+ // Expression: `$ mat( ;) $`
+ // - Correct: [ .., Space(" "), Array[Math[], ], Semicolon(";"), .. ]
+ // - Incorrect: [ .., Math[], Array[], Space(" "), Semicolon(";"), .. ]
+ p.flush_trivia();
+ }
+
+ if count != 1 {
+ p.wrap(arg, SyntaxKind::Math);
}
if let Some(m) = named {
@@ -594,66 +560,63 @@ fn maybe_wrap_in_math(p: &mut Parser, arg: Marker, named: Option<Marker>) {
}
/// Parses the contents of a code block.
-fn code(p: &mut Parser, stop: impl FnMut(&Parser) -> bool) {
+fn code(p: &mut Parser, stop_set: SyntaxSet) {
let m = p.marker();
- code_exprs(p, stop);
+ code_exprs(p, stop_set);
p.wrap(m, SyntaxKind::Code);
}
/// Parses a sequence of code expressions.
-fn code_exprs(p: &mut Parser, mut stop: impl FnMut(&Parser) -> bool) {
- while !p.end() && !stop(p) {
- p.enter_newline_mode(NewlineMode::Contextual);
-
- let at_expr = p.at_set(set::CODE_EXPR);
- if at_expr {
+fn code_exprs(p: &mut Parser, stop_set: SyntaxSet) {
+ debug_assert!(stop_set.contains(SyntaxKind::End));
+ while !p.at_set(stop_set) {
+ p.with_nl_mode(AtNewline::ContextualContinue, |p| {
+ if !p.at_set(set::CODE_EXPR) {
+ p.unexpected();
+ return;
+ }
code_expr(p);
- if !p.end() && !stop(p) && !p.eat_if(SyntaxKind::Semicolon) {
+ if !p.at_set(stop_set) && !p.eat_if(SyntaxKind::Semicolon) {
p.expected("semicolon or line break");
if p.at(SyntaxKind::Label) {
p.hint("labels can only be applied in markup mode");
p.hint("try wrapping your code in a markup block (`[ ]`)");
}
}
- }
-
- p.exit_newline_mode();
- if !at_expr && !p.end() {
- p.unexpected();
- }
+ });
}
}
-/// Parses a single code expression.
-fn code_expr(p: &mut Parser) {
- code_expr_prec(p, false, 0)
-}
-
-/// Parses a code expression embedded in markup or math.
+/// Parses an atomic code expression embedded in markup or math.
fn embedded_code_expr(p: &mut Parser) {
- p.enter_newline_mode(NewlineMode::Stop);
- p.enter(LexMode::Code);
- p.assert(SyntaxKind::Hash);
- p.unskip();
+ p.enter_modes(LexMode::Code, AtNewline::Stop, |p| {
+ p.assert(SyntaxKind::Hash);
+ if p.had_trivia() || p.end() {
+ p.expected("expression");
+ return;
+ }
- let stmt = p.at_set(set::STMT);
- let at = p.at_set(set::ATOMIC_CODE_EXPR);
- code_expr_prec(p, true, 0);
+ let stmt = p.at_set(set::STMT);
+ let at = p.at_set(set::ATOMIC_CODE_EXPR);
+ code_expr_prec(p, true, 0);
- // Consume error for things like `#12p` or `#"abc\"`.#
- if !at && !p.current().is_trivia() && !p.end() {
- p.unexpected();
- }
+ // Consume error for things like `#12p` or `#"abc\"`.#
+ if !at {
+ p.unexpected();
+ }
- let semi =
- (stmt || p.directly_at(SyntaxKind::Semicolon)) && p.eat_if(SyntaxKind::Semicolon);
+ let semi = (stmt || p.directly_at(SyntaxKind::Semicolon))
+ && p.eat_if(SyntaxKind::Semicolon);
- if stmt && !semi && !p.end() && !p.at(SyntaxKind::RightBracket) {
- p.expected("semicolon or line break");
- }
+ if stmt && !semi && !p.end() && !p.at(SyntaxKind::RightBracket) {
+ p.expected("semicolon or line break");
+ }
+ });
+}
- p.exit();
- p.exit_newline_mode();
+/// Parses a single code expression.
+fn code_expr(p: &mut Parser) {
+ code_expr_prec(p, false, 0)
}
/// Parses a code expression with at least the given precedence.
@@ -676,8 +639,8 @@ fn code_expr_prec(p: &mut Parser, atomic: bool, min_prec: usize) {
continue;
}
- let at_field_or_method =
- p.directly_at(SyntaxKind::Dot) && p.lexer.clone().next() == SyntaxKind::Ident;
+ let at_field_or_method = p.directly_at(SyntaxKind::Dot)
+ && p.lexer.clone().next().0 == SyntaxKind::Ident;
if atomic && !at_field_or_method {
break;
@@ -757,7 +720,6 @@ fn code_primary(p: &mut Parser, atomic: bool) {
SyntaxKind::LeftBrace => code_block(p),
SyntaxKind::LeftBracket => content_block(p),
SyntaxKind::LeftParen => expr_with_paren(p, atomic),
- SyntaxKind::RawDelim => raw(p),
SyntaxKind::Dollar => equation(p),
SyntaxKind::Let => let_binding(p),
SyntaxKind::Set => set_rule(p),
@@ -772,6 +734,8 @@ fn code_primary(p: &mut Parser, atomic: bool) {
SyntaxKind::Continue => continue_stmt(p),
SyntaxKind::Return => return_stmt(p),
+ SyntaxKind::Raw => p.eat(), // Raw is handled entirely in the Lexer.
+
SyntaxKind::None
| SyntaxKind::Auto
| SyntaxKind::Int
@@ -785,15 +749,6 @@ fn code_primary(p: &mut Parser, atomic: bool) {
}
}
-/// Parses a content or code block.
-fn block(p: &mut Parser) {
- match p.current() {
- SyntaxKind::LeftBracket => content_block(p),
- SyntaxKind::LeftBrace => code_block(p),
- _ => p.expected("block"),
- }
-}
-
/// Reparses a full content or code block.
pub(super) fn reparse_block(text: &str, range: Range<usize>) -> Option<SyntaxNode> {
let mut p = Parser::new(text, range.start, LexMode::Code);
@@ -803,27 +758,34 @@ pub(super) fn reparse_block(text: &str, range: Range<usize>) -> Option<SyntaxNod
.then(|| p.finish().into_iter().next().unwrap())
}
+/// Parses a content or code block.
+fn block(p: &mut Parser) {
+ match p.current() {
+ SyntaxKind::LeftBracket => content_block(p),
+ SyntaxKind::LeftBrace => code_block(p),
+ _ => p.expected("block"),
+ }
+}
+
/// Parses a code block: `{ let x = 1; x + 2 }`.
fn code_block(p: &mut Parser) {
let m = p.marker();
- p.enter(LexMode::Code);
- p.enter_newline_mode(NewlineMode::Continue);
- p.assert(SyntaxKind::LeftBrace);
- code(p, |p| p.at_set(syntax_set!(RightBrace, RightBracket, RightParen)));
- p.expect_closing_delimiter(m, SyntaxKind::RightBrace);
- p.exit();
- p.exit_newline_mode();
+ p.enter_modes(LexMode::Code, AtNewline::Continue, |p| {
+ p.assert(SyntaxKind::LeftBrace);
+ code(p, syntax_set!(RightBrace, RightBracket, RightParen, End));
+ p.expect_closing_delimiter(m, SyntaxKind::RightBrace);
+ });
p.wrap(m, SyntaxKind::CodeBlock);
}
/// Parses a content block: `[*Hi* there!]`.
fn content_block(p: &mut Parser) {
let m = p.marker();
- p.enter(LexMode::Markup);
- p.assert(SyntaxKind::LeftBracket);
- markup(p, true, 0, |p| p.at(SyntaxKind::RightBracket));
- p.expect_closing_delimiter(m, SyntaxKind::RightBracket);
- p.exit();
+ p.enter_modes(LexMode::Markup, AtNewline::Continue, |p| {
+ p.assert(SyntaxKind::LeftBracket);
+ markup(p, true, true, syntax_set!(RightBracket, End));
+ p.expect_closing_delimiter(m, SyntaxKind::RightBracket);
+ });
p.wrap(m, SyntaxKind::ContentBlock);
}
@@ -937,9 +899,8 @@ fn for_loop(p: &mut Parser) {
let mut seen = HashSet::new();
pattern(p, false, &mut seen, None);
- let m2 = p.marker();
- if p.eat_if(SyntaxKind::Comma) {
- let node = &mut p[m2];
+ if p.at(SyntaxKind::Comma) {
+ let node = p.eat_and_get();
node.unexpected();
node.hint("destructuring patterns must be wrapped in parentheses");
if p.at_set(set::PATTERN) {
@@ -967,14 +928,14 @@ fn module_import(p: &mut Parser) {
if p.eat_if(SyntaxKind::Colon) {
if p.at(SyntaxKind::LeftParen) {
- let m1 = p.marker();
- p.enter_newline_mode(NewlineMode::Continue);
- p.assert(SyntaxKind::LeftParen);
+ p.with_nl_mode(AtNewline::Continue, |p| {
+ let m2 = p.marker();
+ p.assert(SyntaxKind::LeftParen);
- import_items(p);
+ import_items(p);
- p.expect_closing_delimiter(m1, SyntaxKind::RightParen);
- p.exit_newline_mode();
+ p.expect_closing_delimiter(m2, SyntaxKind::RightParen);
+ });
} else if !p.eat_if(SyntaxKind::Star) {
import_items(p);
}
@@ -1047,29 +1008,25 @@ fn return_stmt(p: &mut Parser) {
/// An expression that starts with a parenthesis.
fn expr_with_paren(p: &mut Parser, atomic: bool) {
- // If we've seen this position before and have a memoized result, just use
- // it. See below for more explanation about this memoization.
- let start = p.current_start();
- if let Some((range, end_point)) = p.memo.get(&start).cloned() {
- // Restore the end point first, so that it doesn't truncate our freshly
- // pushed nodes. If the current length of `p.nodes` doesn't match what
- // we had in the memoized run, this might otherwise happen.
- p.restore(end_point);
- p.nodes.extend(p.memo_arena[range].iter().cloned());
+ if atomic {
+ // Atomic expressions aren't modified by operators that follow them, so
+ // our first guess of array/dict will be correct.
+ parenthesized_or_array_or_dict(p);
return;
}
- let m = p.marker();
- let checkpoint = p.checkpoint();
+ // If we've seen this position before and have a memoized result, restore it
+ // and return. Otherwise, get a key to this position and a checkpoint to
+ // restart from in case we make a wrong prediction.
+ let Some((memo_key, checkpoint)) = p.restore_memo_or_checkpoint() else { return };
+ // The node length from when we restored.
+ let prev_len = checkpoint.node_len;
// When we reach a '(', we can't be sure what it is. First, we attempt to
// parse as a simple parenthesized expression, array, or dictionary as
// these are the most likely things. We can handle all of those in a single
// pass.
let kind = parenthesized_or_array_or_dict(p);
- if atomic {
- return;
- }
// If, however, '=>' or '=' follows, we must backtrack and reparse as either
// a parameter list or a destructuring. To be able to do that, we created a
@@ -1090,6 +1047,7 @@ fn expr_with_paren(p: &mut Parser, atomic: bool) {
// case running time of O(2n).
if p.at(SyntaxKind::Arrow) {
p.restore(checkpoint);
+ let m = p.marker();
params(p);
if !p.expect(SyntaxKind::Arrow) {
return;
@@ -1098,6 +1056,7 @@ fn expr_with_paren(p: &mut Parser, atomic: bool) {
p.wrap(m, SyntaxKind::Closure);
} else if p.at(SyntaxKind::Eq) && kind != SyntaxKind::Parenthesized {
p.restore(checkpoint);
+ let m = p.marker();
destructuring_or_parenthesized(p, true, &mut HashSet::new());
if !p.expect(SyntaxKind::Eq) {
return;
@@ -1109,9 +1068,7 @@ fn expr_with_paren(p: &mut Parser, atomic: bool) {
}
// Memoize result if we backtracked.
- let offset = p.memo_arena.len();
- p.memo_arena.extend(p.nodes[m.0..].iter().cloned());
- p.memo.insert(start, (offset..p.memo_arena.len(), p.checkpoint()));
+ p.memoize_parsed_nodes(memo_key, prev_len);
}
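
The memoization being reworked here is packrat-style: parse results are cached by source offset so that backtracking in `expr_with_paren` stays linear overall. A minimal sketch of the idea, with hypothetical names (`Memo`, `get_or_parse`) rather than the real `MemoArena` API, which also restores a full parser checkpoint:

use std::collections::HashMap;

/// A minimal packrat-style memo table keyed by source offset.
struct Memo<T: Clone> {
    table: HashMap<usize, Vec<T>>,
}

impl<T: Clone> Memo<T> {
    fn new() -> Self {
        Self { table: HashMap::new() }
    }

    /// Reuse a cached parse result for `start`, or run `parse` and cache it.
    fn get_or_parse(&mut self, start: usize, parse: impl FnOnce() -> Vec<T>) -> Vec<T> {
        if let Some(nodes) = self.table.get(&start) {
            return nodes.clone(); // We backtracked to a known offset: no re-parse.
        }
        let nodes = parse();
        self.table.insert(start, nodes.clone());
        nodes
    }
}

fn main() {
    let mut memo = Memo::new();
    let first = memo.get_or_parse(0, || vec!["(", "a", ")"]);
    // A later visit to offset 0 (after backtracking) hits the cache.
    let second = memo.get_or_parse(0, || unreachable!("memoized"));
    assert_eq!(first, second);
}
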
/// Parses either
@@ -1119,10 +1076,6 @@ fn expr_with_paren(p: &mut Parser, atomic: bool) {
/// - an array: `(1, "hi", 12cm)`, or
/// - a dictionary: `(thickness: 3pt, pattern: dashed)`.
fn parenthesized_or_array_or_dict(p: &mut Parser) -> SyntaxKind {
- let m = p.marker();
- p.enter_newline_mode(NewlineMode::Continue);
- p.assert(SyntaxKind::LeftParen);
-
let mut state = GroupState {
count: 0,
maybe_just_parens: true,
@@ -1130,27 +1083,44 @@ fn parenthesized_or_array_or_dict(p: &mut Parser) -> SyntaxKind {
seen: HashSet::new(),
};
- if p.eat_if(SyntaxKind::Colon) {
- state.kind = Some(SyntaxKind::Dict);
- state.maybe_just_parens = false;
- }
-
- while !p.current().is_terminator() {
- if !p.at_set(set::ARRAY_OR_DICT_ITEM) {
- p.unexpected();
- continue;
+ // An edge case with parens is whether we can interpret a leading spread
+ // expression as a dictionary, e.g. if we want `(..dict1, ..dict2)` to join
+ // the two dicts.
+ //
+ // The issue is that we decide on the type of the parenthesized expression
+ // here in the parser by the `SyntaxKind` we wrap with, instead of in eval
+ // based on the type of the spread item.
+ //
+ // The current fix is that we allow a leading colon to force the
+ // parenthesized value into a dict:
+ // - `(..arr1, ..arr2)` is wrapped as an `Array`.
+ // - `(: ..dict1, ..dict2)` is wrapped as a `Dict`.
+ //
+ // This does allow some unexpected expressions, such as `(: key: val)`, but
+ // it's currently intentional.
+ let m = p.marker();
+ p.with_nl_mode(AtNewline::Continue, |p| {
+ p.assert(SyntaxKind::LeftParen);
+ if p.eat_if(SyntaxKind::Colon) {
+ state.kind = Some(SyntaxKind::Dict);
}
- array_or_dict_item(p, &mut state);
- state.count += 1;
+ while !p.current().is_terminator() {
+ if !p.at_set(set::ARRAY_OR_DICT_ITEM) {
+ p.unexpected();
+ continue;
+ }
+
+ array_or_dict_item(p, &mut state);
+ state.count += 1;
- if !p.current().is_terminator() && p.expect(SyntaxKind::Comma) {
- state.maybe_just_parens = false;
+ if !p.current().is_terminator() && p.expect(SyntaxKind::Comma) {
+ state.maybe_just_parens = false;
+ }
}
- }
- p.expect_closing_delimiter(m, SyntaxKind::RightParen);
- p.exit_newline_mode();
+ p.expect_closing_delimiter(m, SyntaxKind::RightParen);
+ });
let kind = if state.maybe_just_parens && state.count == 1 {
SyntaxKind::Parenthesized
@@ -1165,8 +1135,13 @@ fn parenthesized_or_array_or_dict(p: &mut Parser) -> SyntaxKind {
/// State for array/dictionary parsing.
struct GroupState {
count: usize,
+ /// Whether this is just a single expression in parens: `(a)`. Single
+ /// element arrays require an explicit comma: `(a,)`, unless we're
+ /// spreading: `(..a)`.
maybe_just_parens: bool,
+ /// The `SyntaxKind` to wrap as (if we've figured it out yet).
kind: Option<SyntaxKind>,
+ /// Store named arguments so we can give an error if they're repeated.
seen: HashSet<EcoString>,
}
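
The final kind decision can be written as a standalone function; a simplification of `GroupState`, since the real parser updates `state.kind` incrementally as it encounters named pairs or the leading colon:

#[derive(Debug, PartialEq)]
enum Kind {
    Parenthesized,
    Array,
    Dict,
}

fn group_kind(count: usize, maybe_just_parens: bool, forced_dict: bool) -> Kind {
    if forced_dict {
        Kind::Dict // `(key: val)` or a leading colon `(: ..d1, ..d2)`.
    } else if maybe_just_parens && count == 1 {
        Kind::Parenthesized // `(a)` is just grouping.
    } else {
        Kind::Array // `()`, `(a,)`, `(a, b)`, and `(..a)` are arrays.
    }
}

fn main() {
    assert_eq!(group_kind(1, true, false), Kind::Parenthesized);
    assert_eq!(group_kind(1, false, false), Kind::Array); // `(a,)`
    assert_eq!(group_kind(2, false, true), Kind::Dict); // `(k: v, x: y)`
}
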
@@ -1234,25 +1209,25 @@ fn args(p: &mut Parser) {
let m = p.marker();
if p.at(SyntaxKind::LeftParen) {
let m2 = p.marker();
- p.enter_newline_mode(NewlineMode::Continue);
- p.assert(SyntaxKind::LeftParen);
+ p.with_nl_mode(AtNewline::Continue, |p| {
+ p.assert(SyntaxKind::LeftParen);
- let mut seen = HashSet::new();
- while !p.current().is_terminator() {
- if !p.at_set(set::ARG) {
- p.unexpected();
- continue;
- }
+ let mut seen = HashSet::new();
+ while !p.current().is_terminator() {
+ if !p.at_set(set::ARG) {
+ p.unexpected();
+ continue;
+ }
- arg(p, &mut seen);
+ arg(p, &mut seen);
- if !p.current().is_terminator() {
- p.expect(SyntaxKind::Comma);
+ if !p.current().is_terminator() {
+ p.expect(SyntaxKind::Comma);
+ }
}
- }
- p.expect_closing_delimiter(m2, SyntaxKind::RightParen);
- p.exit_newline_mode();
+ p.expect_closing_delimiter(m2, SyntaxKind::RightParen);
+ });
}
while p.directly_at(SyntaxKind::LeftBracket) {
@@ -1297,27 +1272,27 @@ fn arg<'s>(p: &mut Parser<'s>, seen: &mut HashSet<&'s str>) {
/// Parses a closure's parameters: `(x, y)`.
fn params(p: &mut Parser) {
let m = p.marker();
- p.enter_newline_mode(NewlineMode::Continue);
- p.assert(SyntaxKind::LeftParen);
+ p.with_nl_mode(AtNewline::Continue, |p| {
+ p.assert(SyntaxKind::LeftParen);
- let mut seen = HashSet::new();
- let mut sink = false;
+ let mut seen = HashSet::new();
+ let mut sink = false;
- while !p.current().is_terminator() {
- if !p.at_set(set::PARAM) {
- p.unexpected();
- continue;
- }
+ while !p.current().is_terminator() {
+ if !p.at_set(set::PARAM) {
+ p.unexpected();
+ continue;
+ }
- param(p, &mut seen, &mut sink);
+ param(p, &mut seen, &mut sink);
- if !p.current().is_terminator() {
- p.expect(SyntaxKind::Comma);
+ if !p.current().is_terminator() {
+ p.expect(SyntaxKind::Comma);
+ }
}
- }
- p.expect_closing_delimiter(m, SyntaxKind::RightParen);
- p.exit_newline_mode();
+ p.expect_closing_delimiter(m, SyntaxKind::RightParen);
+ });
p.wrap(m, SyntaxKind::Params);
}
@@ -1378,25 +1353,25 @@ fn destructuring_or_parenthesized<'s>(
let mut maybe_just_parens = true;
let m = p.marker();
- p.enter_newline_mode(NewlineMode::Continue);
- p.assert(SyntaxKind::LeftParen);
+ p.with_nl_mode(AtNewline::Continue, |p| {
+ p.assert(SyntaxKind::LeftParen);
- while !p.current().is_terminator() {
- if !p.at_set(set::DESTRUCTURING_ITEM) {
- p.unexpected();
- continue;
- }
+ while !p.current().is_terminator() {
+ if !p.at_set(set::DESTRUCTURING_ITEM) {
+ p.unexpected();
+ continue;
+ }
- destructuring_item(p, reassignment, seen, &mut maybe_just_parens, &mut sink);
- count += 1;
+ destructuring_item(p, reassignment, seen, &mut maybe_just_parens, &mut sink);
+ count += 1;
- if !p.current().is_terminator() && p.expect(SyntaxKind::Comma) {
- maybe_just_parens = false;
+ if !p.current().is_terminator() && p.expect(SyntaxKind::Comma) {
+ maybe_just_parens = false;
+ }
}
- }
- p.expect_closing_delimiter(m, SyntaxKind::RightParen);
- p.exit_newline_mode();
+ p.expect_closing_delimiter(m, SyntaxKind::RightParen);
+ });
if maybe_just_parens && count == 1 && !sink {
p.wrap(m, SyntaxKind::Parenthesized);
@@ -1429,6 +1404,9 @@ fn destructuring_item<'s>(
// Parse a normal positional pattern or a destructuring key.
let was_at_pat = p.at_set(set::PATTERN);
+
+ // We must use a full checkpoint here (can't just clone the lexer) because
+    // there may be trivia between the identifier and the colon that we need to skip.
let checkpoint = p.checkpoint();
if !(p.eat_if(SyntaxKind::Ident) && p.at(SyntaxKind::Colon)) {
p.restore(checkpoint);
@@ -1487,122 +1465,274 @@ fn pattern_leaf<'s>(
}
}
-/// Manages parsing of a stream of tokens.
+/// Manages parsing a stream of tokens into a tree of [`SyntaxNode`]s.
+///
+/// The implementation presents an interface that investigates a current `token`
+/// with a [`SyntaxKind`] and can take one of the following actions:
+///
+/// 1. Eat a token: push `token` onto the `nodes` vector as a [leaf
+/// node](`SyntaxNode::leaf`) and prepare a new `token` by calling into the
+/// lexer.
+/// 2. Wrap nodes from a marker to the end of `nodes` (excluding `token` and any
+/// attached trivia) into an [inner node](`SyntaxNode::inner`) of a specific
+/// `SyntaxKind`.
+/// 3. Produce or convert nodes into an [error node](`SyntaxNode::error`) when
+/// something expected is missing or something unexpected is found.
+///
+/// Overall the parser produces a nested tree of SyntaxNodes as a "_Concrete_
+/// Syntax Tree." The raw Concrete Syntax Tree should contain the entire source
+/// text, and is used as-is for e.g. syntax highlighting and IDE features. In
+/// `ast.rs` the CST is interpreted as a lazy view over an "_Abstract_ Syntax
+/// Tree." The AST module skips over irrelevant tokens -- whitespace, comments,
+/// code parens, commas in function args, etc. -- as it iterates through the
+/// tree.
+///
+/// ### Modes
+///
+/// The parser manages the transitions between the three modes of Typst through
+/// [lexer modes](`LexMode`) and [newline modes](`AtNewline`).
+///
+/// The lexer modes map to the three Typst modes and are stored in the lexer,
+/// changing which `SyntaxKind`s it will generate.
+///
+/// The newline mode is used to determine whether a newline should end the
+/// current expression. If so, the parser temporarily changes `token`'s kind to
+/// a fake [`SyntaxKind::End`]. When the parser exits the mode, the original
+/// `SyntaxKind` is restored.
struct Parser<'s> {
+ /// The source text shared with the lexer.
text: &'s str,
+ /// A lexer over the source text with multiple modes. Defines the boundaries
+ /// of tokens and determines their [`SyntaxKind`]. Contains the [`LexMode`]
+ /// defining our current Typst mode.
lexer: Lexer<'s>,
- prev_end: usize,
- current_start: usize,
- current: SyntaxKind,
+ /// The newline mode: whether to insert a temporary end at newlines.
+ nl_mode: AtNewline,
+ /// The current token under inspection, not yet present in `nodes`. This
+ /// acts like a single item of lookahead for the parser.
+ ///
+ /// When wrapping, this is _not_ included in the wrapped nodes.
+ token: Token,
+ /// Whether the parser has the expected set of open/close delimiters. This
+ /// only ever transitions from `true` to `false`.
balanced: bool,
+ /// Nodes representing the concrete syntax tree of previously parsed text.
+ /// In Code and Math, includes previously parsed trivia, but not `token`.
nodes: Vec<SyntaxNode>,
- modes: Vec<LexMode>,
- newline_modes: Vec<NewlineMode>,
- memo: HashMap<usize, (Range<usize>, Checkpoint<'s>)>,
- memo_arena: Vec<SyntaxNode>,
+ /// Parser checkpoints for a given text index. Used for efficient parser
+ /// backtracking similar to packrat parsing. See comments above in
+ /// [`expr_with_paren`].
+ memo: MemoArena,
+}
+
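The defining property of the "concrete" tree described in the doc comment above is that concatenating its leaves in source order reproduces the input text exactly, trivia included. A toy illustration; typst's real `SyntaxNode` also carries spans and errors:

enum Node {
    Leaf(&'static str),
    Inner(Vec<Node>),
}

/// Reassemble the source text by walking the leaves in order.
fn text(node: &Node, out: &mut String) {
    match node {
        Node::Leaf(s) => out.push_str(s),
        Node::Inner(children) => {
            for child in children {
                text(child, out);
            }
        }
    }
}

fn main() {
    // A plausible tree for `*hi*`: Strong[Star, Markup[Text], Star].
    let tree = Node::Inner(vec![
        Node::Leaf("*"),
        Node::Inner(vec![Node::Leaf("hi")]),
        Node::Leaf("*"),
    ]);
    let mut source = String::new();
    text(&tree, &mut source);
    assert_eq!(source, "*hi*");
}
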
+/// A single token returned from the lexer with a cached [`SyntaxKind`] and a
+/// record of preceding trivia.
+#[derive(Debug, Clone)]
+struct Token {
+ /// The [`SyntaxKind`] of the current token.
+ kind: SyntaxKind,
+ /// The [`SyntaxNode`] of the current token, ready to be eaten and pushed
+ /// onto the end of `nodes`.
+ node: SyntaxNode,
+    /// The number of trivia nodes preceding this token.
+ n_trivia: usize,
+ /// Whether this token's preceding trivia contained a newline.
+ newline: Option<Newline>,
+ /// The index into `text` of the start of our current token (the end is
+ /// stored as the lexer's cursor).
+ start: usize,
+ /// The index into `text` of the end of the previous token.
+ prev_end: usize,
}
-/// How to proceed with parsing when seeing a newline.
-#[derive(Clone)]
-enum NewlineMode {
- /// Stop always.
- Stop,
- /// Proceed if there is no continuation with `else` or `.`
- Contextual,
- /// Just proceed like with normal whitespace.
- Continue,
+/// Information about a newline if present (currently only relevant in Markup).
+#[derive(Debug, Clone, Copy)]
+struct Newline {
+ /// The column of the start of our token in its line.
+ column: Option<usize>,
+ /// Whether any of our newlines were paragraph breaks.
+ parbreak: bool,
}
+/// How to proceed with parsing when at a newline.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+enum AtNewline {
+ /// Continue at newlines.
+ Continue,
+ /// Stop at any newline.
+ Stop,
+ /// Continue only if there is no continuation with `else` or `.` (Code only).
+ ContextualContinue,
+ /// Stop only at a parbreak, not normal newlines (Markup only).
+ StopParBreak,
+    /// Require that the token's column be greater than a given column (Markup
+    /// only), i.e. stop at a token whose column is less than or equal to it.
+    /// If this is `usize::MAX`, it acts like `Stop`.
+    RequireColumn(usize),
+}
+
+impl AtNewline {
+ /// Whether to stop at a newline or continue based on the current context.
+ fn stop_at(self, Newline { column, parbreak }: Newline, kind: SyntaxKind) -> bool {
+ #[allow(clippy::match_like_matches_macro)]
+ match self {
+ AtNewline::Continue => false,
+ AtNewline::Stop => true,
+ AtNewline::ContextualContinue => match kind {
+ SyntaxKind::Else | SyntaxKind::Dot => false,
+ _ => true,
+ },
+ AtNewline::StopParBreak => parbreak,
+ AtNewline::RequireColumn(min_col) => match column {
+ Some(column) => column <= min_col,
+ None => false, // Don't stop if we had no column.
+ },
+ }
+ }
+}
+
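A standalone mirror of the two markup-only modes shows how the decision plays out for strong/emph and list items. A sketch only, since the real `stop_at` also inspects the upcoming token's `SyntaxKind`:

#[derive(Clone, Copy)]
enum AtNewline {
    StopParBreak,
    RequireColumn(usize),
}

impl AtNewline {
    fn stop_at(self, column: Option<usize>, parbreak: bool) -> bool {
        match self {
            // Strong/emph run across single newlines but end at blank lines.
            AtNewline::StopParBreak => parbreak,
            // List items continue only while lines are indented past the
            // marker's column; no column means we don't stop.
            AtNewline::RequireColumn(min_col) => {
                column.is_some_and(|column| column <= min_col)
            }
        }
    }
}

fn main() {
    let list = AtNewline::RequireColumn(0); // A list marker at column 0.
    assert!(!list.stop_at(Some(2), false)); // An indented line continues the item,
    assert!(list.stop_at(Some(0), false)); // but a column-0 line ends it.
    assert!(AtNewline::StopParBreak.stop_at(Some(0), true)); // Parbreaks stop emph.
}
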
+/// A marker representing a node's position in the parser. Mainly used for
+/// wrapping, but can also index into the parser to access the node, like
+/// `p[m]`.
#[derive(Debug, Copy, Clone, Eq, PartialEq)]
struct Marker(usize);
-#[derive(Clone)]
-struct Checkpoint<'s> {
- lexer: Lexer<'s>,
- prev_end: usize,
- current_start: usize,
- current: SyntaxKind,
- nodes: usize,
+// Index into the parser with markers.
+impl Index<Marker> for Parser<'_> {
+ type Output = SyntaxNode;
+
+ fn index(&self, m: Marker) -> &Self::Output {
+ &self.nodes[m.0]
+ }
}
+impl IndexMut<Marker> for Parser<'_> {
+ fn index_mut(&mut self, m: Marker) -> &mut Self::Output {
+ &mut self.nodes[m.0]
+ }
+}
+
+/// Creating/Consuming the parser and getting info about the current token.
impl<'s> Parser<'s> {
+ /// Create a new parser starting from the given text offset and lexer mode.
fn new(text: &'s str, offset: usize, mode: LexMode) -> Self {
let mut lexer = Lexer::new(text, mode);
lexer.jump(offset);
- let current = lexer.next();
+ let nl_mode = AtNewline::Continue;
+ let mut nodes = vec![];
+ let token = Self::lex(&mut nodes, &mut lexer, nl_mode);
Self {
- lexer,
text,
- prev_end: offset,
- current_start: offset,
- current,
+ lexer,
+ nl_mode,
+ token,
balanced: true,
- nodes: vec![],
- modes: vec![],
- newline_modes: vec![],
- memo: HashMap::new(),
- memo_arena: vec![],
+ nodes,
+ memo: Default::default(),
}
}
+ /// Consume the parser, yielding the full vector of parsed SyntaxNodes.
fn finish(self) -> Vec<SyntaxNode> {
self.nodes
}
- fn prev_end(&self) -> usize {
- self.prev_end
+ /// Consume the parser, generating a single top-level node.
+ fn finish_into(self, kind: SyntaxKind) -> SyntaxNode {
+ assert!(self.at(SyntaxKind::End));
+ SyntaxNode::inner(kind, self.finish())
}
+ /// Similar to a `peek()` function: returns the `kind` of the next token to
+ /// be eaten.
fn current(&self) -> SyntaxKind {
- self.current
+ self.token.kind
}
- fn current_start(&self) -> usize {
- self.current_start
+ /// Whether the current token is a given [`SyntaxKind`].
+ fn at(&self, kind: SyntaxKind) -> bool {
+ self.token.kind == kind
}
- fn current_end(&self) -> usize {
- self.lexer.cursor()
+ /// Whether the current token is contained in a [`SyntaxSet`].
+ fn at_set(&self, set: SyntaxSet) -> bool {
+ set.contains(self.token.kind)
+ }
+
+ /// Whether we're at the end of the token stream.
+ ///
+ /// Note: This might be a fake end due to the newline mode.
+ fn end(&self) -> bool {
+ self.at(SyntaxKind::End)
+ }
+
+ /// If we're at the given `kind` with no preceding trivia tokens.
+ fn directly_at(&self, kind: SyntaxKind) -> bool {
+ self.token.kind == kind && !self.had_trivia()
}
+ /// Whether `token` had any preceding trivia.
+ fn had_trivia(&self) -> bool {
+ self.token.n_trivia > 0
+ }
+
+ /// Whether `token` had a newline among any of its preceding trivia.
+ fn had_newline(&self) -> bool {
+ self.token.newline.is_some()
+ }
+
+    /// The number of characters between the most recent newline and the
+    /// current token, or 0 if the token did not follow a newline.
+ fn current_column(&self) -> usize {
+ self.token.newline.and_then(|newline| newline.column).unwrap_or(0)
+ }
+
+ /// The current token's text.
fn current_text(&self) -> &'s str {
- &self.text[self.current_start..self.current_end()]
+ &self.text[self.token.start..self.current_end()]
}
- fn at(&self, kind: SyntaxKind) -> bool {
- self.current == kind
+ /// The offset into `text` of the current token's start.
+ fn current_start(&self) -> usize {
+ self.token.start
}
- fn at_set(&self, set: SyntaxSet) -> bool {
- set.contains(self.current)
+ /// The offset into `text` of the current token's end.
+ fn current_end(&self) -> usize {
+ self.lexer.cursor()
}
- fn end(&self) -> bool {
- self.at(SyntaxKind::End)
+ /// The offset into `text` of the previous token's end.
+ fn prev_end(&self) -> usize {
+ self.token.prev_end
}
+}
- fn directly_at(&self, kind: SyntaxKind) -> bool {
- self.current == kind && self.prev_end == self.current_start
+// The main parsing interface for generating tokens and eating/modifying nodes.
+impl<'s> Parser<'s> {
+ /// A marker that will point to the current token in the parser once it's
+ /// been eaten.
+ fn marker(&self) -> Marker {
+ Marker(self.nodes.len())
}
- fn eat(&mut self) {
- self.save();
- self.lex();
- self.skip();
+    /// A marker that will point to the first trivia node before this token
+    /// in the parser (or the token itself if no trivia precede it).
+ fn before_trivia(&self) -> Marker {
+ Marker(self.nodes.len() - self.token.n_trivia)
}
+ /// Eat the current node and return a reference for in-place mutation.
#[track_caller]
fn eat_and_get(&mut self) -> &mut SyntaxNode {
let offset = self.nodes.len();
- self.save();
- self.lex();
- self.skip();
+ self.eat();
&mut self.nodes[offset]
}
- /// Eats if at `kind`.
+ /// Eat the token if at `kind`. Returns `true` if eaten.
///
- /// Note: In math and code mode, this will ignore trivia in front of the
+ /// Note: In Math and Code, this will ignore trivia in front of the
/// `kind`. To forbid skipping trivia, consider using `eat_if_direct`.
fn eat_if(&mut self, kind: SyntaxKind) -> bool {
let at = self.at(kind);
@@ -1612,7 +1742,8 @@ impl<'s> Parser<'s> {
at
}
- /// Eats only if currently at the start of `kind`.
+ /// Eat the token only if at `kind` with no preceding trivia. Returns `true`
+ /// if eaten.
fn eat_if_direct(&mut self, kind: SyntaxKind) -> bool {
let at = self.directly_at(kind);
if at {
@@ -1621,188 +1752,228 @@ impl<'s> Parser<'s> {
at
}
+ /// Assert that we are at the given [`SyntaxKind`] and eat it. This should
+ /// be used when moving between functions that expect to start with a
+ /// specific token.
#[track_caller]
fn assert(&mut self, kind: SyntaxKind) {
- assert_eq!(self.current, kind);
+ assert_eq!(self.token.kind, kind);
self.eat();
}
- fn convert(&mut self, kind: SyntaxKind) {
- self.current = kind;
+ /// Convert the current token's [`SyntaxKind`] and eat it.
+ fn convert_and_eat(&mut self, kind: SyntaxKind) {
+ // Only need to replace the node here.
+ self.token.node.convert_to_kind(kind);
self.eat();
}
- fn newline(&mut self) -> bool {
- self.lexer.newline()
- }
-
- fn column(&self, at: usize) -> usize {
- self.text[..at].chars().rev().take_while(|&c| !is_newline(c)).count()
- }
-
- fn marker(&self) -> Marker {
- Marker(self.nodes.len())
- }
-
- /// Get a marker after the last non-trivia node.
- fn before_trivia(&self) -> Marker {
- let mut i = self.nodes.len();
- if self.lexer.mode() != LexMode::Markup && self.prev_end != self.current_start {
- while i > 0 && self.nodes[i - 1].kind().is_trivia() {
- i -= 1;
- }
- }
- Marker(i)
- }
-
- /// Whether the last non-trivia node is an error.
- fn after_error(&mut self) -> bool {
- let m = self.before_trivia();
- m.0 > 0 && self.nodes[m.0 - 1].kind().is_error()
+ /// Eat the current token by saving it to the `nodes` vector, then move
+ /// the lexer forward to prepare a new token.
+ fn eat(&mut self) {
+ self.nodes.push(std::mem::take(&mut self.token.node));
+ self.token = Self::lex(&mut self.nodes, &mut self.lexer, self.nl_mode);
}
- #[track_caller]
- fn post_process(&mut self, m: Marker) -> impl Iterator<Item = &mut SyntaxNode> {
- self.nodes[m.0..]
- .iter_mut()
- .filter(|child| !child.kind().is_error() && !child.kind().is_trivia())
+ /// Detach the parsed trivia nodes from this token (but not newline info) so
+ /// that subsequent wrapping will include the trivia.
+ fn flush_trivia(&mut self) {
+ self.token.n_trivia = 0;
+ self.token.prev_end = self.token.start;
}
+ /// Wrap the nodes from a marker up to (but excluding) the current token in
+ /// a new [inner node](`SyntaxNode::inner`) of the given kind. This is an
+ /// easy interface for creating nested syntax nodes _after_ having parsed
+ /// their children.
fn wrap(&mut self, from: Marker, kind: SyntaxKind) {
- self.wrap_within(from, self.before_trivia(), kind);
- }
-
- fn wrap_all(&mut self, from: Marker, kind: SyntaxKind) {
- self.wrap_within(from, Marker(self.nodes.len()), kind)
- }
-
- fn wrap_within(&mut self, from: Marker, to: Marker, kind: SyntaxKind) {
- let len = self.nodes.len();
- let to = to.0.min(len);
+ let to = self.before_trivia().0;
let from = from.0.min(to);
let children = self.nodes.drain(from..to).collect();
self.nodes.insert(from, SyntaxNode::inner(kind, children));
}
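The marker/wrap pair is how the parser builds nested nodes bottom-up. A hedged sketch of the idiom with illustrative names (`primary` is hypothetical; `Plus` and `Binary` are real `SyntaxKind`s):

    fn plus_expr(p: &mut Parser) {
        let m = p.marker();
        primary(p); // parse the left-hand side
        while p.eat_if(SyntaxKind::Plus) {
            primary(p); // parse the right-hand side
            // Everything from `m` up to (but excluding) the current token
            // becomes one `Binary` node; trailing trivia stay outside it.
            p.wrap(m, SyntaxKind::Binary);
        }
    }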
- fn enter(&mut self, mode: LexMode) {
- self.modes.push(self.lexer.mode());
+ /// Parse within the [`LexMode`] for subsequent tokens (does not change the
+ /// current token). This may re-lex the final token on exit.
+ ///
+ /// This function effectively repurposes the call stack as a stack of modes.
+ fn enter_modes(
+ &mut self,
+ mode: LexMode,
+ stop: AtNewline,
+ func: impl FnOnce(&mut Parser<'s>),
+ ) {
+ let previous = self.lexer.mode();
self.lexer.set_mode(mode);
+ self.with_nl_mode(stop, func);
+ if mode != previous {
+ self.lexer.set_mode(previous);
+ self.lexer.jump(self.token.prev_end);
+ self.nodes.truncate(self.nodes.len() - self.token.n_trivia);
+ self.token = Self::lex(&mut self.nodes, &mut self.lexer, self.nl_mode);
+ }
}
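For example, a code block embedded in markup needs Code-mode lexing between its braces. A sketch of a caller (close in spirit to this commit's `code_block`, but the `code` helper, its set argument, and the `AtNewline::Continue` variant are assumptions):

    fn code_block(p: &mut Parser) {
        let m = p.marker();
        p.enter_modes(LexMode::Code, AtNewline::Continue, |p| {
            p.assert(SyntaxKind::LeftBrace);
            code(p, syntax_set!(RightBrace, End)); // assumed body parser
            p.expect(SyntaxKind::RightBrace);
        });
        p.wrap(m, SyntaxKind::CodeBlock);
    }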
- fn exit(&mut self) {
- let mode = self.modes.pop().unwrap();
- if mode != self.lexer.mode() {
- self.unskip();
- self.lexer.set_mode(mode);
- self.lexer.jump(self.current_start);
- self.lex();
- self.skip();
+ /// Parse within the [`AtNewline`] mode for subsequent tokens (does not
+ /// change the current token). This may re-lex the final token on exit.
+ ///
+ /// This function effectively repurposes the call stack as a stack of modes.
+ fn with_nl_mode(&mut self, mode: AtNewline, func: impl FnOnce(&mut Parser<'s>)) {
+ let previous = self.nl_mode;
+ self.nl_mode = mode;
+ func(self);
+ self.nl_mode = previous;
+ if let Some(newline) = self.token.newline {
+ if mode != previous {
+ // Restore our actual token's kind or insert a fake end.
+ let actual_kind = self.token.node.kind();
+ if self.nl_mode.stop_at(newline, actual_kind) {
+ self.token.kind = SyntaxKind::End;
+ } else {
+ self.token.kind = actual_kind;
+ }
+ }
}
}
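Newline sensitivity can also be toggled on its own. A one-line sketch, assuming a plain `AtNewline::Stop` variant exists (`expr` stands in for any parsing function):

    // Within the closure, any token preceded by a newline surfaces as
    // `SyntaxKind::End`, so `expr` cannot run past the line break.
    p.with_nl_mode(AtNewline::Stop, |p| expr(p));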
- fn enter_newline_mode(&mut self, stop: NewlineMode) {
- self.newline_modes.push(stop);
- }
+ /// Move the lexer forward and prepare the current token. In Code, this
+ /// might insert a temporary [`SyntaxKind::End`] based on our newline mode.
+ ///
+ /// This is not a method on `self` because we need a valid token before we
+ /// can initialize the parser.
+ fn lex(nodes: &mut Vec<SyntaxNode>, lexer: &mut Lexer, nl_mode: AtNewline) -> Token {
+ let prev_end = lexer.cursor();
+ let mut start = prev_end;
+ let (mut kind, mut node) = lexer.next();
+ let mut n_trivia = 0;
+ let mut had_newline = false;
+ let mut parbreak = false;
+
+ while kind.is_trivia() {
+ had_newline |= lexer.newline(); // Newlines are always trivia.
+ parbreak |= kind == SyntaxKind::Parbreak;
+ n_trivia += 1;
+ nodes.push(node);
+ start = lexer.cursor();
+ (kind, node) = lexer.next();
+ }
+
+ let newline = if had_newline {
+ let column = (lexer.mode() == LexMode::Markup).then(|| lexer.column(start));
+ let newline = Newline { column, parbreak };
+ if nl_mode.stop_at(newline, kind) {
+ // Insert a temporary `SyntaxKind::End` to halt the parser.
+ // The actual kind will be restored from `node` later.
+ kind = SyntaxKind::End;
+ }
+ Some(newline)
+ } else {
+ None
+ };
- fn exit_newline_mode(&mut self) {
- self.unskip();
- self.newline_modes.pop();
- self.lexer.jump(self.prev_end);
- self.lex();
- self.skip();
+ Token { kind, node, n_trivia, newline, start, prev_end }
}
+}
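A worked example of what `lex` produces (hedged: the exact trivia token boundaries are an assumption). Lexing the Code-mode input "  // note\n  x" under `AtNewline::Continue`:

    // Three trivia nodes are pushed directly into `nodes`:
    //   Space("  "), LineComment("// note"), Space("\n  ")
    // and the returned token is roughly:
    // Token {
    //     kind: SyntaxKind::Ident,  // the `x`
    //     node: <leaf node for `x`>,
    //     n_trivia: 3,
    //     newline: Some(Newline { column: None, parbreak: false }),
    //     start: 12,                // byte offset of `x`
    //     prev_end: 0,              // nothing eaten before this
    // }
    // `column` is `None` because columns are only computed in Markup. Under
    // a stopping newline mode, `kind` would be `SyntaxKind::End` instead,
    // with the real kind recoverable from `node`.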
- fn checkpoint(&self) -> Checkpoint<'s> {
- Checkpoint {
- lexer: self.lexer.clone(),
- prev_end: self.prev_end,
- current_start: self.current_start,
- current: self.current,
- nodes: self.nodes.len(),
- }
- }
+/// Extra parser state for efficiently recovering from mispredicted parses.
+///
+/// This is the same idea as packrat parsing, but we use it only in the limited
+/// case of parenthesized structures. See [`expr_with_paren`] for more.
+#[derive(Default)]
+struct MemoArena {
+ /// A single arena of previously parsed nodes (to reduce allocations).
+ /// Memoized ranges refer to unique sections of the arena.
+ arena: Vec<SyntaxNode>,
+ /// A map from the parser's current position to a range of previously parsed
+ /// nodes in the arena and a checkpoint of the parser's state. These allow
+ /// us to reset the parser to avoid parsing the same location again.
+ memo_map: HashMap<MemoKey, (Range<usize>, PartialState)>,
+}
+
+/// A type alias for the memo key so it doesn't get confused with other usizes.
+///
+/// The memo is keyed by the index into `text` of the current token's start.
+type MemoKey = usize;
- fn restore(&mut self, checkpoint: Checkpoint<'s>) {
- self.lexer = checkpoint.lexer;
- self.prev_end = checkpoint.prev_end;
- self.current_start = checkpoint.current_start;
- self.current = checkpoint.current;
- self.nodes.truncate(checkpoint.nodes);
- }
+/// A checkpoint of the parser which can fully restore it to a previous state.
+struct Checkpoint {
+ node_len: usize,
+ state: PartialState,
+}
- fn skip(&mut self) {
- if self.lexer.mode() != LexMode::Markup {
- while self.current.is_trivia() {
- self.save();
- self.lex();
- }
- }
- }
+/// State needed to restore the parser's current token and the lexer (but not
+/// the nodes vector).
+#[derive(Clone)]
+struct PartialState {
+ cursor: usize,
+ lex_mode: LexMode,
+ token: Token,
+}
- fn unskip(&mut self) {
- if self.lexer.mode() != LexMode::Markup && self.prev_end != self.current_start {
- while self.nodes.last().is_some_and(|last| last.kind().is_trivia()) {
- self.nodes.pop();
+/// The memoization interface.
+impl<'s> Parser<'s> {
+ /// Store the already parsed nodes and the parser state into the memo map by
+ /// extending the arena and storing the extended range and a checkpoint.
+ fn memoize_parsed_nodes(&mut self, key: MemoKey, prev_len: usize) {
+ let Checkpoint { state, node_len } = self.checkpoint();
+ let memo_start = self.memo.arena.len();
+ self.memo.arena.extend_from_slice(&self.nodes[prev_len..node_len]);
+ let arena_range = memo_start..self.memo.arena.len();
+ self.memo.memo_map.insert(key, (arena_range, state));
+ }
+
+ /// Try to load a memoized result. Returns `None` if a prior parse was
+ /// restored from the memo map, or `Some` (with a checkpoint and a key for
+ /// the memo map) if nothing was memoized at this position.
+ fn restore_memo_or_checkpoint(&mut self) -> Option<(MemoKey, Checkpoint)> {
+ // We use the starting index of the current token as our key.
+ let key: MemoKey = self.current_start();
+ match self.memo.memo_map.get(&key).cloned() {
+ Some((range, state)) => {
+ self.nodes.extend_from_slice(&self.memo.arena[range]);
+ // It's important that we don't truncate the nodes vector since
+ // it may have grown or shrunk (due to other memoization or
+ // error reporting) since we made this checkpoint.
+ self.restore_partial(state);
+ None
}
-
- self.lexer.jump(self.prev_end);
- self.lex();
+ None => Some((key, self.checkpoint())),
}
}
- fn save(&mut self) {
- let text = self.current_text();
- if self.at(SyntaxKind::Error) {
- let error = self.lexer.take_error().unwrap();
- self.nodes.push(SyntaxNode::error(error, text));
- } else {
- self.nodes.push(SyntaxNode::leaf(self.current, text));
- }
-
- if self.lexer.mode() == LexMode::Markup || !self.current.is_trivia() {
- self.prev_end = self.current_end();
- }
+ /// Restore the parser to the state at a checkpoint.
+ fn restore(&mut self, checkpoint: Checkpoint) {
+ self.nodes.truncate(checkpoint.node_len);
+ self.restore_partial(checkpoint.state);
}
- fn next_non_trivia(lexer: &mut Lexer<'s>) -> SyntaxKind {
- loop {
- let next = lexer.next();
- // Loop is terminable, because SyntaxKind::End is not a trivia.
- if !next.is_trivia() {
- break next;
- }
- }
+ /// Restore parts of the checkpoint excluding the nodes vector.
+ fn restore_partial(&mut self, state: PartialState) {
+ self.lexer.jump(state.cursor);
+ self.lexer.set_mode(state.lex_mode);
+ self.token = state.token;
}
- fn lex(&mut self) {
- self.current_start = self.lexer.cursor();
- self.current = self.lexer.next();
-
- // Special cases to handle newlines in code mode.
- if self.lexer.mode() == LexMode::Code
- && self.lexer.newline()
- && match self.newline_modes.last() {
- Some(NewlineMode::Continue) => false,
- Some(NewlineMode::Contextual) => !matches!(
- Self::next_non_trivia(&mut self.lexer.clone()),
- SyntaxKind::Else | SyntaxKind::Dot
- ),
- Some(NewlineMode::Stop) => true,
- None => false,
- }
- {
- self.current = SyntaxKind::End;
- }
+ /// Save a checkpoint of the parser state.
+ fn checkpoint(&self) -> Checkpoint {
+ let node_len = self.nodes.len();
+ let state = PartialState {
+ cursor: self.lexer.cursor(),
+ lex_mode: self.lexer.mode(),
+ token: self.token.clone(),
+ };
+ Checkpoint { node_len, state }
}
}
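The intended round-trip through this interface, as a sketch (modeled on the `expr_with_paren` use case mentioned above; `parse_paren_expr` is a stand-in name):

    let Some((key, checkpoint)) = p.restore_memo_or_checkpoint() else {
        return; // An earlier parse here was replayed from the arena.
    };
    let prev_len = checkpoint.node_len;
    parse_paren_expr(p); // the potentially mispredicted parse
    p.memoize_parsed_nodes(key, prev_len);
    // A later visit to the same `key` copies these nodes back out of the
    // arena and restores the lexer/token state instead of re-parsing.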
+/// Functions for eating expected or unexpected tokens and generating errors if
+/// we don't get what we expect.
impl<'s> Parser<'s> {
- /// Consume the given syntax `kind` or produce an error.
+ /// Consume the given `kind` or produce an error.
fn expect(&mut self, kind: SyntaxKind) -> bool {
let at = self.at(kind);
if at {
self.eat();
- } else if kind == SyntaxKind::Ident && self.current.is_keyword() {
+ } else if kind == SyntaxKind::Ident && self.token.kind.is_keyword() {
self.trim_errors();
self.eat_and_get().expected(kind.name());
} else {
@@ -1828,6 +1999,12 @@ impl<'s> Parser<'s> {
}
}
+ /// Whether the last non-trivia node is an error.
+ fn after_error(&mut self) -> bool {
+ let m = self.before_trivia();
+ m.0 > 0 && self.nodes[m.0 - 1].kind().is_error()
+ }
+
/// Produce an error that the given `thing` was expected at the position
/// of the marker `m`.
fn expected_at(&mut self, m: Marker, thing: &str) {
@@ -1836,7 +2013,7 @@ impl<'s> Parser<'s> {
self.nodes.insert(m.0, error);
}
- /// Produce a hint.
+ /// Add a hint to a trailing error.
fn hint(&mut self, hint: &str) {
let m = self.before_trivia();
if let Some(error) = self.nodes.get_mut(m.0 - 1) {
@@ -1848,7 +2025,7 @@ impl<'s> Parser<'s> {
/// unexpected.
fn unexpected(&mut self) {
self.trim_errors();
- self.balanced &= !self.current.is_grouping();
+ self.balanced &= !self.token.kind.is_grouping();
self.eat_and_get().unexpected();
}
@@ -1865,17 +2042,3 @@ impl<'s> Parser<'s> {
self.nodes.drain(start..end);
}
}
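Typical recovery with these helpers, as a hedged sketch (the kinds and hint text are illustrative, not from this commit):

    p.assert(SyntaxKind::Let);
    let m = p.marker();
    if !p.eat_if(SyntaxKind::Ident) {
        // Insert an error node at `m`, before any skipped trivia.
        p.expected_at(m, "identifier");
    }
    if !p.expect(SyntaxKind::Eq) {
        // `expect` already emitted the error; attach a hint to it.
        p.hint("the binding needs an initializer");
    }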
-
-impl Index<Marker> for Parser<'_> {
- type Output = SyntaxNode;
-
- fn index(&self, m: Marker) -> &Self::Output {
- &self.nodes[m.0]
- }
-}
-
-impl IndexMut<Marker> for Parser<'_> {
- fn index_mut(&mut self, m: Marker) -> &mut Self::Output {
- &mut self.nodes[m.0]
- }
-}
diff --git a/crates/typst-syntax/src/reparser.rs b/crates/typst-syntax/src/reparser.rs
index 7a970490..c20d8314 100644
--- a/crates/typst-syntax/src/reparser.rs
+++ b/crates/typst-syntax/src/reparser.rs
@@ -157,19 +157,13 @@ fn try_reparse(
let new_range = shifted..shifted + new_len;
let at_end = end == children.len();
- // Stop parsing early if this kind is encountered.
- let stop_kind = match parent_kind {
- Some(_) => SyntaxKind::RightBracket,
- None => SyntaxKind::End,
- };
-
// Reparse!
let reparsed = reparse_markup(
text,
new_range.clone(),
&mut at_start,
&mut nesting,
- |kind| kind == stop_kind,
+ parent_kind.is_none(),
);
if let Some(newborns) = reparsed {
diff --git a/crates/typst-syntax/src/set.rs b/crates/typst-syntax/src/set.rs
index eaee7ef2..014aaf2f 100644
--- a/crates/typst-syntax/src/set.rs
+++ b/crates/typst-syntax/src/set.rs
@@ -58,6 +58,7 @@ pub const STMT: SyntaxSet = syntax_set!(Let, Set, Show, Import, Include, Return)
pub const MATH_EXPR: SyntaxSet = syntax_set!(
Hash,
MathIdent,
+ FieldAccess,
Text,
MathShorthand,
Linebreak,
@@ -104,7 +105,7 @@ pub const ATOMIC_CODE_PRIMARY: SyntaxSet = syntax_set!(
Numeric,
Str,
Label,
- RawDelim,
+ Raw,
);
/// Syntax kinds that are unary operators.