summaryrefslogtreecommitdiff
path: root/crates/typst-syntax
diff options
context:
space:
mode:
Diffstat (limited to 'crates/typst-syntax')
-rw-r--r--crates/typst-syntax/src/parser.rs153
1 files changed, 102 insertions, 51 deletions
diff --git a/crates/typst-syntax/src/parser.rs b/crates/typst-syntax/src/parser.rs
index 50277fab..2a7e4611 100644
--- a/crates/typst-syntax/src/parser.rs
+++ b/crates/typst-syntax/src/parser.rs
@@ -1057,29 +1057,25 @@ fn return_stmt(p: &mut Parser) {
/// An expression that starts with a parenthesis.
fn expr_with_paren(p: &mut Parser, atomic: bool) {
- // If we've seen this position before and have a memoized result, just use
- // it. See below for more explanation about this memoization.
- let start = p.current_start();
- if let Some((range, end_point)) = p.memo.get(&start).cloned() {
- // Restore the end point first, so that it doesn't truncate our freshly
- // pushed nodes. If the current length of `p.nodes` doesn't match what
- // we had in the memoized run, this might otherwise happen.
- p.restore(end_point);
- p.nodes.extend(p.memo_arena[range].iter().cloned());
+ if atomic {
+ // Atomic expressions aren't modified by operators that follow them, so
+ // our first guess of array/dict will be correct.
+ parenthesized_or_array_or_dict(p);
return;
}
- let m = p.marker();
- let checkpoint = p.checkpoint();
+ // If we've seen this position before and have a memoized result, restore it
+ // and return. Otherwise, get a key to this position and a checkpoint to
+ // restart from in case we make a wrong prediction.
+ let Some((memo_key, checkpoint)) = p.restore_memo_or_checkpoint() else { return };
+ // The node length from when we restored.
+ let prev_len = checkpoint.node_len;
// When we reach a '(', we can't be sure what it is. First, we attempt to
// parse as a simple parenthesized expression, array, or dictionary as
// these are the most likely things. We can handle all of those in a single
// pass.
let kind = parenthesized_or_array_or_dict(p);
- if atomic {
- return;
- }
// If, however, '=>' or '=' follows, we must backtrack and reparse as either
// a parameter list or a destructuring. To be able to do that, we created a
@@ -1100,6 +1096,7 @@ fn expr_with_paren(p: &mut Parser, atomic: bool) {
// case running time of O(2n).
if p.at(SyntaxKind::Arrow) {
p.restore(checkpoint);
+ let m = p.marker();
params(p);
if !p.expect(SyntaxKind::Arrow) {
return;
@@ -1108,6 +1105,7 @@ fn expr_with_paren(p: &mut Parser, atomic: bool) {
p.wrap(m, SyntaxKind::Closure);
} else if p.at(SyntaxKind::Eq) && kind != SyntaxKind::Parenthesized {
p.restore(checkpoint);
+ let m = p.marker();
destructuring_or_parenthesized(p, true, &mut HashSet::new());
if !p.expect(SyntaxKind::Eq) {
return;
@@ -1119,9 +1117,7 @@ fn expr_with_paren(p: &mut Parser, atomic: bool) {
}
// Memoize result if we backtracked.
- let offset = p.memo_arena.len();
- p.memo_arena.extend(p.nodes[m.0..].iter().cloned());
- p.memo.insert(start, (offset..p.memo_arena.len(), p.checkpoint()));
+ p.memoize_parsed_nodes(memo_key, prev_len);
}
/// Parses either
@@ -1456,6 +1452,9 @@ fn destructuring_item<'s>(
// Parse a normal positional pattern or a destructuring key.
let was_at_pat = p.at_set(set::PATTERN);
+
+ // We must use a full checkpoint here (can't just clone the lexer) because
+ // there may be trivia between the identifier and the colon we need to skip.
let checkpoint = p.checkpoint();
if !(p.eat_if(SyntaxKind::Ident) && p.at(SyntaxKind::Colon)) {
p.restore(checkpoint);
@@ -1579,9 +1578,7 @@ struct Parser<'s> {
/// Parser checkpoints for a given text index. Used for efficient parser
/// backtracking similar to packrat parsing. See comments above in
/// [`expr_with_paren`].
- memo: HashMap<usize, (Range<usize>, Checkpoint<'s>)>,
- /// The stored parse results at each checkpoint.
- memo_arena: Vec<SyntaxNode>,
+ memo: MemoArena<'s>,
}
/// How to proceed with parsing when at a newline in Code.
@@ -1601,15 +1598,6 @@ enum NewlineMode {
#[derive(Debug, Copy, Clone, Eq, PartialEq)]
struct Marker(usize);
-#[derive(Clone)]
-struct Checkpoint<'s> {
- lexer: Lexer<'s>,
- prev_end: usize,
- current_start: usize,
- current: SyntaxKind,
- nodes: usize,
-}
-
impl<'s> Parser<'s> {
/// Create a new parser starting from the given text offset and lexer mode.
fn new(text: &'s str, offset: usize, mode: LexMode) -> Self {
@@ -1626,8 +1614,7 @@ impl<'s> Parser<'s> {
nodes: vec![],
modes: vec![],
newline_modes: vec![],
- memo: HashMap::new(),
- memo_arena: vec![],
+ memo: Default::default(),
}
}
@@ -1836,26 +1823,6 @@ impl<'s> Parser<'s> {
self.skip();
}
- /// Save a checkpoint of the parser state.
- fn checkpoint(&self) -> Checkpoint<'s> {
- Checkpoint {
- lexer: self.lexer.clone(),
- prev_end: self.prev_end,
- current_start: self.current_start,
- current: self.current,
- nodes: self.nodes.len(),
- }
- }
-
- /// Reset the parser from a checkpoint.
- fn restore(&mut self, checkpoint: Checkpoint<'s>) {
- self.lexer = checkpoint.lexer;
- self.prev_end = checkpoint.prev_end;
- self.current_start = checkpoint.current_start;
- self.current = checkpoint.current;
- self.nodes.truncate(checkpoint.nodes);
- }
-
/// Move past trivia nodes in Code/Math.
fn skip(&mut self) {
if self.lexer.mode() != LexMode::Markup {
@@ -1929,6 +1896,90 @@ impl<'s> Parser<'s> {
}
}
+/// Extra parser state for efficiently recovering from mispredicted parses.
+///
+/// This is the same idea as packrat parsing, but we use it only in the limited
+/// case of parenthesized structures. See [`expr_with_paren`] for more.
+#[derive(Default)]
+struct MemoArena<'s> {
+ /// A single arena of previously parsed nodes (to reduce allocations).
+ /// Memoized ranges refer to unique sections of the arena.
+ arena: Vec<SyntaxNode>,
+ /// A map from the parser's current position to a range of previously parsed
+ /// nodes in the arena and a checkpoint of the parser's state. These allow
+ /// us to reset the parser to avoid parsing the same location again.
+ memo_map: HashMap<MemoKey, (Range<usize>, Checkpoint<'s>)>,
+}
+
+/// A type alias for the memo key so it doesn't get confused with other usizes.
+///
+/// The memo is keyed by the index into `text` of the current token's start.
+type MemoKey = usize;
+
+/// A checkpoint of the parser which can fully restore it to a previous state.
+#[derive(Clone)]
+struct Checkpoint<'s> {
+ lexer: Lexer<'s>,
+ prev_end: usize,
+ current_start: usize,
+ current: SyntaxKind,
+ node_len: usize,
+}
+
+impl<'s> Parser<'s> {
+ /// Store the already parsed nodes and the parser state into the memo map by
+ /// extending the arena and storing the extended range and a checkpoint.
+ fn memoize_parsed_nodes(&mut self, key: MemoKey, prev_len: usize) {
+ let memo_start = self.memo.arena.len();
+ self.memo.arena.extend_from_slice(&self.nodes[prev_len..]);
+ let arena_range = memo_start..self.memo.arena.len();
+ self.memo.memo_map.insert(key, (arena_range, self.checkpoint()));
+ }
+
+ /// Try to load a memoized result, return `None` if we did or `Some` (with a
+ /// checkpoint and a key for the memo map) if we didn't.
+ fn restore_memo_or_checkpoint(&mut self) -> Option<(MemoKey, Checkpoint<'s>)> {
+ // We use the starting index of the current token as our key.
+ let key: MemoKey = self.current_start();
+ match self.memo.memo_map.get(&key).cloned() {
+ Some((range, checkpoint)) => {
+ self.nodes.extend_from_slice(&self.memo.arena[range]);
+ // It's important that we don't truncate the nodes vector since
+ // it may have grown or shrunk (due to other memoization or
+ // error reporting) since we made this checkpoint.
+ self.restore_partial(checkpoint);
+ None
+ }
+ None => Some((key, self.checkpoint())),
+ }
+ }
+
+ /// Restore the parser to the state at a checkpoint.
+ fn restore(&mut self, checkpoint: Checkpoint<'s>) {
+ self.nodes.truncate(checkpoint.node_len);
+ self.restore_partial(checkpoint);
+ }
+
+ /// Restore parts of the checkpoint excluding the nodes vector.
+ fn restore_partial(&mut self, checkpoint: Checkpoint<'s>) {
+ self.lexer = checkpoint.lexer;
+ self.prev_end = checkpoint.prev_end;
+ self.current_start = checkpoint.current_start;
+ self.current = checkpoint.current;
+ }
+
+ /// Save a checkpoint of the parser state.
+ fn checkpoint(&self) -> Checkpoint<'s> {
+ Checkpoint {
+ lexer: self.lexer.clone(),
+ prev_end: self.prev_end,
+ current_start: self.current_start,
+ current: self.current,
+ node_len: self.nodes.len(),
+ }
+ }
+}
+
impl<'s> Parser<'s> {
/// Consume the given `kind` or produce an error.
fn expect(&mut self, kind: SyntaxKind) -> bool {