-rw-r--r--  crates/typst-syntax/README.md      |  40
-rw-r--r--  crates/typst-syntax/src/parser.rs  | 170
2 files changed, 193 insertions(+), 17 deletions(-)
diff --git a/crates/typst-syntax/README.md b/crates/typst-syntax/README.md
new file mode 100644
index 00000000..ced4096e
--- /dev/null
+++ b/crates/typst-syntax/README.md
@@ -0,0 +1,40 @@
+# typst-syntax
+
+Welcome to the Typst Syntax crate! This crate manages the syntactic structure
+of Typst source code. It holds core abstractions for assigning source file
+ids, parsing Typst syntax, creating an Abstract Syntax Tree (AST),
+initializing source "spans" (which link AST elements to their outputs in a
+document), and syntax highlighting.
+
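+If you just want to parse some Typst source, the crate's entry point is the
+`parse` function. A minimal sketch of using it (the `Debug` output prints the
+full nested tree):
+
+```rust
+use typst_syntax::parse;
+
+fn main() {
+    // Parse markup into a Concrete Syntax Tree of `SyntaxNode`s.
+    let root = parse("= Hi\nHello _world_!");
+    // The CST keeps the entire source text, trivia included.
+    println!("{root:#?}");
+}
+```
+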
+Below are quick descriptions of the files you might be editing if you find
+yourself here :)
+
+- `lexer.rs`: The lexical foundation of the parser, which converts a string of
+ characters into tokens.
+- `parser.rs`: The main parser definition, preparing a Concrete Syntax Tree made
+ of nested vectors of `SyntaxNode`s.
+- `reparser.rs`: The algorithm for reparsing the minimal required amount of
+ source text for efficient incremental compilation.
+- `ast.rs`: The conversion layer between the Concrete Syntax Tree of the parser
+ and the Abstract Syntax Tree used for code evaluation.
+- `node.rs` & `span.rs`: The underlying data structure for the Concrete Syntax
+ Tree and the definitions of source spans used for efficiently pointing to a
+ syntax node in things like diagnostics.
+- `kind.rs` & `set.rs`: An enum with all syntactical tokens and nodes and a
+  bit-set data structure for sets of `SyntaxKind`s.
+- `highlight.rs`: Extraction of syntax highlighting information from the
+  Concrete Syntax Tree (and output as HTML).
+- `path.rs`, `file.rs`, `package.rs`: The system for interning project and
+ package paths as unique file IDs and resolving them in a virtual filesystem
+ (not actually for _opening_ files).
+
+The structure of the parser is largely adapted from Rust Analyzer. Their
+[documentation][ra] is a good reference for a number of the design decisions
+around the parser and AST.
+
+The reparsing algorithm is explained in Section 4 of [Martin's thesis][thesis]
+(though it has changed a bit since then).
+
+[ra]: https://github.com/rust-lang/rust-analyzer/blob/master/docs/dev/syntax.md
+[thesis]:
+ https://www.researchgate.net/publication/364622490_Fast_Typesetting_with_Incremental_Compilation
diff --git a/crates/typst-syntax/src/parser.rs b/crates/typst-syntax/src/parser.rs
index 8c783ffe..afa47257 100644
--- a/crates/typst-syntax/src/parser.rs
+++ b/crates/typst-syntax/src/parser.rs
@@ -10,7 +10,7 @@ use crate::{
ast, is_ident, is_newline, set, LexMode, Lexer, SyntaxError, SyntaxKind, SyntaxNode,
};
-/// Parses a source file.
+/// Parses a source file as top-level markup.
pub fn parse(text: &str) -> SyntaxNode {
let _scope = typst_timing::TimingScope::new("parse");
let mut p = Parser::new(text, 0, LexMode::Markup);
@@ -37,7 +37,7 @@ pub fn parse_math(text: &str) -> SyntaxNode {
p.finish().into_iter().next().unwrap()
}
-/// Parses the contents of a file or content block.
+/// Parses markup expressions until a stop condition is met.
fn markup(
p: &mut Parser,
mut at_start: bool,
@@ -96,7 +96,7 @@ pub(super) fn reparse_markup(
(p.balanced && p.current_start() == range.end).then(|| p.finish())
}
-/// Parses a single markup expression: This includes markup elements like
+/// Parses a single markup expression. This includes markup elements like
/// spaces, text, and headings, and embedded code expressions.
fn markup_expr(p: &mut Parser, at_start: &mut bool) {
match p.current() {
@@ -414,6 +414,7 @@ fn math_expr_prec(p: &mut Parser, min_prec: usize, stop: SyntaxKind) {
}
}
+/// Try to parse delimiters based on the current token's unicode math class.
fn maybe_delimited(p: &mut Parser) -> bool {
let open = math_class(p.current_text()) == Some(MathClass::Opening);
if open {
@@ -422,6 +423,7 @@ fn maybe_delimited(p: &mut Parser) -> bool {
open
}
+/// Parse matched delimiters in math: `[x + y]`.
fn math_delimited(p: &mut Parser) {
let m = p.marker();
p.eat();
@@ -444,6 +446,8 @@ fn math_delimited(p: &mut Parser) {
p.wrap(m, SyntaxKind::Math);
}
+/// Remove one set of parentheses (if any) from a previously parsed expression
+/// by converting them to non-expression `SyntaxKind`s.
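+/// For example, the parentheses in `$(a + b)/c$` are removed once the
+/// fraction is parsed.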
fn math_unparen(p: &mut Parser, m: Marker) {
let Some(node) = p.nodes.get_mut(m.0) else { return };
if node.kind() != SyntaxKind::MathDelimited {
@@ -460,6 +464,10 @@ fn math_unparen(p: &mut Parser, m: Marker) {
node.convert_to_kind(SyntaxKind::Math);
}
+/// The unicode math class of a string. Only returns `Some` if `text` is a
+/// single unicode character or a math shorthand string (currently just `[|`,
+/// `||`, `|]`), and only if a math class is defined for that character.
fn math_class(text: &str) -> Option<MathClass> {
match text {
"[|" => return Some(MathClass::Opening),
@@ -475,6 +483,7 @@ fn math_class(text: &str) -> Option<MathClass> {
.and_then(unicode_math_class::class)
}
+/// Precedence and wrapper kinds for the binary math operators.
fn math_op(kind: SyntaxKind) -> Option<(SyntaxKind, SyntaxKind, ast::Assoc, usize)> {
match kind {
SyntaxKind::Underscore => {
@@ -490,6 +499,7 @@ fn math_op(kind: SyntaxKind) -> Option<(SyntaxKind, SyntaxKind, ast::Assoc, usiz
}
}
+/// Parse an argument list in math: `(a, b; c, d; size: #50%)`.
fn math_args(p: &mut Parser) {
let m = p.marker();
p.convert(SyntaxKind::LeftParen);
@@ -629,7 +639,7 @@ fn code_expr(p: &mut Parser) {
code_expr_prec(p, false, 0)
}
-/// Parses a code expression embedded in markup or math.
+/// Parses an atomic code expression embedded in markup or math.
fn embedded_code_expr(p: &mut Parser) {
p.enter_newline_mode(NewlineMode::Stop);
p.enter(LexMode::Code);
@@ -1130,6 +1140,21 @@ fn parenthesized_or_array_or_dict(p: &mut Parser) -> SyntaxKind {
seen: HashSet::new(),
};
+ // An edge case with parens is whether we can interpret a leading spread
+ // expression as a dictionary, e.g. if we want `(..dict1, ..dict2)` to join
+ // the two dicts.
+ //
+ // The issue is that we decide on the type of the parenthesized expression
+ // here in the parser by the `SyntaxKind` we wrap with, instead of in eval
+ // based on the type of the spread item.
+ //
+ // The current fix is that we allow a leading colon to force the
+ // parenthesized value into a dict:
+ // - `(..arr1, ..arr2)` is wrapped as an `Array`.
+ // - `(: ..dict1, ..dict2)` is wrapped as a `Dict`.
+ //
+ // This does allow some unexpected expressions, such as `(: key: val)`, but
+ // it's currently intentional.
if p.eat_if(SyntaxKind::Colon) {
state.kind = Some(SyntaxKind::Dict);
state.maybe_just_parens = false;
@@ -1165,8 +1190,13 @@ fn parenthesized_or_array_or_dict(p: &mut Parser) -> SyntaxKind {
/// State for array/dictionary parsing.
struct GroupState {
count: usize,
+ /// Whether this is just a single expression in parens: `(a)`. Single
+ /// element arrays require an explicit comma: `(a,)`, unless we're
+ /// spreading: `(..a)`.
maybe_just_parens: bool,
+ /// The `SyntaxKind` to wrap as (if we've figured it out yet).
kind: Option<SyntaxKind>,
+ /// Store named arguments so we can give an error if they're repeated.
seen: HashSet<EcoString>,
}
@@ -1484,32 +1514,90 @@ fn pattern_leaf<'s>(
}
}
-/// Manages parsing of a stream of tokens.
+/// Manages parsing a stream of tokens into a tree of [`SyntaxNode`]s.
+///
+/// The implementation presents an interface that investigates a `current` token
+/// and can take one of the following actions:
+///
+/// 1. Eat a token, pushing `current` into the `nodes` vector as a [leaf
+/// node](`SyntaxNode::leaf`) and prepare a new `current` by calling into the
+/// lexer.
+/// 2. Wrap nodes from a marker to the end of `nodes` (excluding `current`) into
+/// an [inner node](`SyntaxNode::inner`) of a specific [`SyntaxKind`].
+/// 3. Produce or convert nodes into an [error node](`SyntaxNode::error`) when
+/// something expected is missing or something unexpected is found.
+///
+/// Overall the parser produces a nested tree of SyntaxNodes as a "_Concrete_
+/// Syntax Tree." The raw Concrete Syntax Tree should contain the entire source
+/// text, and is used as-is for e.g. syntax highlighting and IDE features. In
+/// `ast.rs` the CST is interpreted as a lazy view over an "_Abstract_ Syntax
+/// Tree." The AST module skips over irrelevant tokens -- whitespace, comments,
+/// code parens, commas in function args, etc. -- as it iterates through the
+/// tree.
+///
+/// ### Modes
+///
+/// The parser manages the transitions between the three modes of Typst through
+/// stacks of [lexer modes](`LexMode`) and [newline modes](`NewlineMode`).
+///
+/// The lexer modes map to the three Typst modes and are stored in the lexer,
+/// changing which `SyntaxKind`s it will generate. The mode also affects how the
+/// parser treats trivia tokens (comments and whitespace). In Markup, trivia is
+/// handled manually to deal with list indentation and must be explicitly eaten.
+/// In Code and Math, trivia is managed internally and is implicitly eaten by
+/// pushing onto the end of the `nodes` vector until a non-trivia kind is found.
+///
+/// The newline mode is used in Code to determine whether a newline should end
+/// the current expression. If so, the parser temporarily changes the current
+/// token's kind to a fake [`SyntaxKind::End`]. When the parser exits the mode,
+/// the original `SyntaxKind` is restored.
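+///
+/// ### Example
+///
+/// A simplified sketch of how a grammar rule drives this interface (the names
+/// and exact calls are illustrative; real rules thread more state):
+///
+/// ```ignore
+/// fn list_item(p: &mut Parser) {
+///     let m = p.marker();               // remember where this node begins
+///     p.assert(SyntaxKind::ListMarker); // eat the leading `-` token
+///     // ... parse the item's body as markup ...
+///     p.wrap(m, SyntaxKind::ListItem);  // wrap children into an inner node
+/// }
+/// ```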
struct Parser<'s> {
+ /// The source text shared with the lexer.
text: &'s str,
+ /// A lexer over the source text with multiple modes. Defines the boundaries
+ /// of tokens and determines their [`SyntaxKind`].
lexer: Lexer<'s>,
+ /// The index into `text` of the end of the previous token.
prev_end: usize,
+ /// The index into `text` of the start of our current token (the end is
+ /// stored as the lexer's cursor).
current_start: usize,
+ /// The [`SyntaxKind`] of the current token.
current: SyntaxKind,
+ /// Whether the parser has the expected set of open/close delimiters. This
+ /// only ever transitions from `true` to `false`.
balanced: bool,
+ /// Nodes representing the concrete syntax tree of previously parsed text.
+ /// In Code and Math, includes previously parsed trivia, but not `current`.
nodes: Vec<SyntaxNode>,
+ /// Stack of lexer modes to be pushed/popped. The current mode is implicitly
+ /// stored in the lexer.
modes: Vec<LexMode>,
+ /// Stack of newline modes to be pushed/popped. The current mode is the tail
+ /// of the vector.
newline_modes: Vec<NewlineMode>,
+ /// Parser checkpoints for a given text index. Used for efficient parser
+    /// backtracking similar to packrat parsing. See the comments in
+ /// [`expr_with_paren`].
memo: HashMap<usize, (Range<usize>, Checkpoint<'s>)>,
+ /// The stored parse results at each checkpoint.
memo_arena: Vec<SyntaxNode>,
}
-/// How to proceed with parsing when seeing a newline.
+/// How to proceed with parsing when at a newline in Code.
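+///
+/// A sketch of where each mode applies (assuming the usual grouping rules):
+///
+/// ```typ
+/// #let x = 1      // `Stop`: the newline ends the embedded expression.
+/// #{
+///   if true { 1 }
+///   else { 2 }    // `Contextual`: continues because `else` follows.
+///   let y = (1 +
+///     2)          // `Continue`: newlines inside parens are whitespace.
+/// }
+/// ```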
#[derive(Clone)]
enum NewlineMode {
- /// Stop always.
+ /// Stop at any newline.
Stop,
- /// Proceed if there is no continuation with `else` or `.`
+    /// Stop at a newline unless it is followed by a continuation with `else`
+    /// or `.`.
Contextual,
- /// Just proceed like with normal whitespace.
+ /// Continue at newlines.
Continue,
}
+/// A marker representing a node's position in the parser. Mainly used for
+/// wrapping, but can also index into the parser to access the node, like
+/// `p[m]`.
#[derive(Debug, Copy, Clone, Eq, PartialEq)]
struct Marker(usize);
@@ -1523,6 +1611,7 @@ struct Checkpoint<'s> {
}
impl<'s> Parser<'s> {
+ /// Create a new parser starting from the given text offset and lexer mode.
fn new(text: &'s str, offset: usize, mode: LexMode) -> Self {
let mut lexer = Lexer::new(text, mode);
lexer.jump(offset);
@@ -1542,52 +1631,68 @@ impl<'s> Parser<'s> {
}
}
+ /// Consume the parser, yielding the full vector of parsed SyntaxNodes.
fn finish(self) -> Vec<SyntaxNode> {
self.nodes
}
+ /// The offset into `text` of the previous token's end.
fn prev_end(&self) -> usize {
self.prev_end
}
+ /// Similar to a `peek()` function: returns the `kind` of the next token to
+ /// be eaten.
fn current(&self) -> SyntaxKind {
self.current
}
+ /// The offset into `text` of the current token's start.
fn current_start(&self) -> usize {
self.current_start
}
+ /// The offset into `text` of the current token's end.
fn current_end(&self) -> usize {
self.lexer.cursor()
}
+ /// The current token's text.
fn current_text(&self) -> &'s str {
&self.text[self.current_start..self.current_end()]
}
+ /// Whether the current token is a given [`SyntaxKind`].
fn at(&self, kind: SyntaxKind) -> bool {
self.current == kind
}
+ /// Whether the current token is contained in a [`SyntaxSet`].
fn at_set(&self, set: SyntaxSet) -> bool {
set.contains(self.current)
}
+ /// Whether we're at the end of the token stream.
+ ///
+ /// Note: This might be a fake end due to the newline mode.
fn end(&self) -> bool {
self.at(SyntaxKind::End)
}
+    /// Whether we're at the given `kind` with no preceding trivia tokens.
fn directly_at(&self, kind: SyntaxKind) -> bool {
self.current == kind && self.prev_end == self.current_start
}
+ /// Eat the current token by saving it to the `nodes` vector, then move
+ /// the lexer forward to prepare a new token.
fn eat(&mut self) {
self.save();
self.lex();
self.skip();
}
+    /// Eat the current token and return a reference to its node for in-place
+    /// mutation.
#[track_caller]
fn eat_and_get(&mut self) -> &mut SyntaxNode {
let offset = self.nodes.len();
@@ -1597,9 +1702,9 @@ impl<'s> Parser<'s> {
&mut self.nodes[offset]
}
- /// Eats if at `kind`.
+ /// Eat the token if at `kind`. Returns `true` if eaten.
///
- /// Note: In math and code mode, this will ignore trivia in front of the
+ /// Note: In Math and Code, this will ignore trivia in front of the
/// `kind`. To forbid skipping trivia, consider using `eat_if_direct`.
fn eat_if(&mut self, kind: SyntaxKind) -> bool {
let at = self.at(kind);
@@ -1609,7 +1714,8 @@ impl<'s> Parser<'s> {
at
}
- /// Eats only if currently at the start of `kind`.
+ /// Eat the token only if at `kind` with no preceding trivia. Returns `true`
+ /// if eaten.
fn eat_if_direct(&mut self, kind: SyntaxKind) -> bool {
let at = self.directly_at(kind);
if at {
@@ -1618,30 +1724,39 @@ impl<'s> Parser<'s> {
at
}
+ /// Assert that we are at the given [`SyntaxKind`] and eat it. This should
+ /// be used when moving between functions that expect to start with a
+ /// specific token.
#[track_caller]
fn assert(&mut self, kind: SyntaxKind) {
assert_eq!(self.current, kind);
self.eat();
}
+ /// Convert the current token's [`SyntaxKind`] and eat it.
fn convert(&mut self, kind: SyntaxKind) {
self.current = kind;
self.eat();
}
+    /// Whether the current token is a newline; only used in Markup.
fn newline(&mut self) -> bool {
self.lexer.newline()
}
+    /// The number of characters between the index `at` and the most recent
+    /// newline before it (i.e. the column).
fn column(&self, at: usize) -> usize {
self.text[..at].chars().rev().take_while(|&c| !is_newline(c)).count()
}
+ /// A marker that will point to the current token in the parser once it's
+ /// been eaten.
fn marker(&self) -> Marker {
Marker(self.nodes.len())
}
- /// Get a marker after the last non-trivia node.
+    /// A marker that will point to the first trivia node before the current
+    /// token (or to the token itself if no trivia precedes it).
fn before_trivia(&self) -> Marker {
let mut i = self.nodes.len();
if self.lexer.mode() != LexMode::Markup && self.prev_end != self.current_start {
@@ -1658,6 +1773,7 @@ impl<'s> Parser<'s> {
m.0 > 0 && self.nodes[m.0 - 1].kind().is_error()
}
+ /// Iterate over the non-trivia tokens following the marker.
#[track_caller]
fn post_process(&mut self, m: Marker) -> impl Iterator<Item = &mut SyntaxNode> {
self.nodes[m.0..]
@@ -1665,10 +1781,15 @@ impl<'s> Parser<'s> {
.filter(|child| !child.kind().is_error() && !child.kind().is_trivia())
}
+ /// Wrap the nodes from a marker up to (but excluding) the current token in
+ /// a new [inner node](`SyntaxNode::inner`) of the given kind. This is an
+ /// easy interface for creating nested syntax nodes _after_ having parsed
+ /// their children.
fn wrap(&mut self, from: Marker, kind: SyntaxKind) {
self.wrap_within(from, self.before_trivia(), kind);
}
+ /// Wrap including any trailing trivia nodes.
fn wrap_all(&mut self, from: Marker, kind: SyntaxKind) {
self.wrap_within(from, Marker(self.nodes.len()), kind)
}
@@ -1681,11 +1802,14 @@ impl<'s> Parser<'s> {
self.nodes.insert(from, SyntaxNode::inner(kind, children));
}
+ /// Enter a new [`LexMode`] that will affect subsequent tokens (does not
+ /// modify the current token).
fn enter(&mut self, mode: LexMode) {
self.modes.push(self.lexer.mode());
self.lexer.set_mode(mode);
}
+ /// Exit the current [`LexMode`], possibly re-lexing the current token.
fn exit(&mut self) {
let mode = self.modes.pop().unwrap();
if mode != self.lexer.mode() {
@@ -1697,10 +1821,13 @@ impl<'s> Parser<'s> {
}
}
+ /// Enter a new [`NewlineMode`] that will affect subsequent tokens (does not
+ /// modify the current token).
fn enter_newline_mode(&mut self, stop: NewlineMode) {
self.newline_modes.push(stop);
}
+ /// Exit the current [`NewlineMode`], possibly re-lexing the current token.
fn exit_newline_mode(&mut self) {
self.unskip();
self.newline_modes.pop();
@@ -1709,6 +1836,7 @@ impl<'s> Parser<'s> {
self.skip();
}
+ /// Save a checkpoint of the parser state.
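+    ///
+    /// Used together with [`Parser::restore`] for backtracking, e.g. (sketch):
+    ///
+    /// ```ignore
+    /// let checkpoint = p.checkpoint();
+    /// // ... speculatively parse one interpretation ...
+    /// if !worked {
+    ///     p.restore(checkpoint); // rewind and try something else
+    /// }
+    /// ```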
fn checkpoint(&self) -> Checkpoint<'s> {
Checkpoint {
lexer: self.lexer.clone(),
@@ -1719,6 +1847,7 @@ impl<'s> Parser<'s> {
}
}
+ /// Reset the parser from a checkpoint.
fn restore(&mut self, checkpoint: Checkpoint<'s>) {
self.lexer = checkpoint.lexer;
self.prev_end = checkpoint.prev_end;
@@ -1727,6 +1856,7 @@ impl<'s> Parser<'s> {
self.nodes.truncate(checkpoint.nodes);
}
+ /// Move past trivia nodes in Code/Math.
fn skip(&mut self) {
if self.lexer.mode() != LexMode::Markup {
while self.current.is_trivia() {
@@ -1736,6 +1866,8 @@ impl<'s> Parser<'s> {
}
}
+ /// Move the parser back to the start of this token or its leading trivia
+ /// (in Code/Math).
fn unskip(&mut self) {
if self.lexer.mode() != LexMode::Markup && self.prev_end != self.current_start {
while self.nodes.last().is_some_and(|last| last.kind().is_trivia()) {
@@ -1747,6 +1879,7 @@ impl<'s> Parser<'s> {
}
}
+    /// Save the current token to the `nodes` vector as a leaf or error node.
fn save(&mut self) {
let text = self.current_text();
if self.at(SyntaxKind::Error) {
@@ -1761,21 +1894,24 @@ impl<'s> Parser<'s> {
}
}
+ /// Find the kind of the next non-trivia token in the lexer.
fn next_non_trivia(lexer: &mut Lexer<'s>) -> SyntaxKind {
loop {
let next = lexer.next();
- // Loop is terminable, because SyntaxKind::End is not a trivia.
+            // Loop is terminable, because `SyntaxKind::End` is not trivia.
if !next.is_trivia() {
break next;
}
}
}
+ /// Move the lexer forward and prepare the current token. In Code, this
+ /// might insert a temporary [`SyntaxKind::End`] based on our newline mode.
fn lex(&mut self) {
self.current_start = self.lexer.cursor();
self.current = self.lexer.next();
- // Special cases to handle newlines in code mode.
+ // Special cases to handle newlines in Code.
if self.lexer.mode() == LexMode::Code
&& self.lexer.newline()
&& match self.newline_modes.last() {
@@ -1794,7 +1930,7 @@ impl<'s> Parser<'s> {
}
impl<'s> Parser<'s> {
- /// Consume the given syntax `kind` or produce an error.
+ /// Consume the given `kind` or produce an error.
fn expect(&mut self, kind: SyntaxKind) -> bool {
let at = self.at(kind);
if at {
@@ -1833,7 +1969,7 @@ impl<'s> Parser<'s> {
self.nodes.insert(m.0, error);
}
- /// Produce a hint.
+ /// Add a hint to a trailing error.
fn hint(&mut self, hint: &str) {
let m = self.before_trivia();
if let Some(error) = self.nodes.get_mut(m.0 - 1) {