diff options
| author | Laurenz <laurmaedje@gmail.com> | 2023-07-18 20:11:31 +0200 |
|---|---|---|
| committer | Laurenz <laurmaedje@gmail.com> | 2023-07-18 21:04:46 +0200 |
| commit | f5953887c9ae0b40a0c3e0ab516daf425c5a598c (patch) | |
| tree | b517ca68517e49bdf458bfa92036a8ff855c72f6 /crates/typst-syntax | |
| parent | 7dc605307cf7d69a3476b8b6fc4786f683c3289b (diff) | |
Extract syntax module into typst-syntax crate
Diffstat (limited to 'crates/typst-syntax')
| -rw-r--r-- | crates/typst-syntax/Cargo.toml | 27 | ||||
| -rw-r--r-- | crates/typst-syntax/src/ast.rs | 2026 | ||||
| -rw-r--r-- | crates/typst-syntax/src/file.rs | 279 | ||||
| -rw-r--r-- | crates/typst-syntax/src/kind.rs | 480 | ||||
| -rw-r--r-- | crates/typst-syntax/src/lexer.rs | 739 | ||||
| -rw-r--r-- | crates/typst-syntax/src/lib.rs | 23 | ||||
| -rw-r--r-- | crates/typst-syntax/src/node.rs | 912 | ||||
| -rw-r--r-- | crates/typst-syntax/src/parser.rs | 1760 | ||||
| -rw-r--r-- | crates/typst-syntax/src/reparser.rs | 322 | ||||
| -rw-r--r-- | crates/typst-syntax/src/source.rs | 423 | ||||
| -rw-r--r-- | crates/typst-syntax/src/span.rs | 128 |
11 files changed, 7119 insertions, 0 deletions
diff --git a/crates/typst-syntax/Cargo.toml b/crates/typst-syntax/Cargo.toml new file mode 100644 index 00000000..928635c4 --- /dev/null +++ b/crates/typst-syntax/Cargo.toml @@ -0,0 +1,27 @@ +[package] +name = "typst-syntax" +description = "Parser and syntax tree for Typst." +categories = ["compilers", "science"] +keywords = ["typst"] +version.workspace = true +rust-version.workspace = true +authors.workspace = true +edition.workspace = true +homepage.workspace = true +repository.workspace = true +license.workspace = true + +[lib] +doctest = false +bench = false + +[dependencies] +comemo = "0.3" +ecow = "0.1.1" +once_cell = "1" +serde = { version = "1", features = ["derive"] } +tracing = "0.1.37" +unicode-ident = "1.0" +unicode-math-class = "0.1" +unicode-segmentation = "1" +unscanny = "0.1" diff --git a/crates/typst-syntax/src/ast.rs b/crates/typst-syntax/src/ast.rs new file mode 100644 index 00000000..c2755d0c --- /dev/null +++ b/crates/typst-syntax/src/ast.rs @@ -0,0 +1,2026 @@ +//! A typed layer over the untyped syntax tree. +//! +//! The AST is rooted in the [`Markup`] node. + +use std::num::NonZeroUsize; +use std::ops::Deref; + +use ecow::EcoString; +use unscanny::Scanner; + +use super::{ + is_id_continue, is_id_start, is_newline, split_newlines, Span, SyntaxKind, SyntaxNode, +}; + +/// A typed AST node. +pub trait AstNode: Sized { + /// Convert a node into its typed variant. + fn from_untyped(node: &SyntaxNode) -> Option<Self>; + + /// A reference to the underlying syntax node. + fn as_untyped(&self) -> &SyntaxNode; + + /// The source code location. + fn span(&self) -> Span { + self.as_untyped().span() + } +} + +macro_rules! node { + ($(#[$attr:meta])* $name:ident) => { + #[derive(Debug, Default, Clone, Hash)] + #[repr(transparent)] + $(#[$attr])* + pub struct $name(SyntaxNode); + + impl AstNode for $name { + fn from_untyped(node: &SyntaxNode) -> Option<Self> { + if matches!(node.kind(), SyntaxKind::$name) { + Some(Self(node.clone())) + } else { + Option::None + } + } + + fn as_untyped(&self) -> &SyntaxNode { + &self.0 + } + } + }; +} + +node! { + /// The syntactical root capable of representing a full parsed document. + Markup +} + +impl Markup { + /// The expressions. + pub fn exprs(&self) -> impl DoubleEndedIterator<Item = Expr> + '_ { + let mut was_stmt = false; + self.0 + .children() + .filter(move |node| { + // Ignore newline directly after statements without semicolons. + let kind = node.kind(); + let keep = !was_stmt || node.kind() != SyntaxKind::Space; + was_stmt = kind.is_stmt(); + keep + }) + .filter_map(Expr::cast_with_space) + } +} + +/// An expression in markup, math or code. +#[derive(Debug, Clone, Hash)] +pub enum Expr { + /// Plain text without markup. + Text(Text), + /// Whitespace in markup or math. Has at most one newline in markup, as more + /// indicate a paragraph break. + Space(Space), + /// A forced line break: `\`. + Linebreak(Linebreak), + /// A paragraph break, indicated by one or multiple blank lines. + Parbreak(Parbreak), + /// An escape sequence: `\#`, `\u{1F5FA}`. + Escape(Escape), + /// A shorthand for a unicode codepoint. For example, `~` for non-breaking + /// space or `-?` for a soft hyphen. + Shorthand(Shorthand), + /// A smart quote: `'` or `"`. + SmartQuote(SmartQuote), + /// Strong content: `*Strong*`. + Strong(Strong), + /// Emphasized content: `_Emphasized_`. + Emph(Emph), + /// Raw text with optional syntax highlighting: `` `...` ``. + Raw(Raw), + /// A hyperlink: `https://typst.org`. + Link(Link), + /// A label: `<intro>`. + Label(Label), + /// A reference: `@target`, `@target[..]`. + Ref(Ref), + /// A section heading: `= Introduction`. + Heading(Heading), + /// An item in a bullet list: `- ...`. + List(ListItem), + /// An item in an enumeration (numbered list): `+ ...` or `1. ...`. + Enum(EnumItem), + /// An item in a term list: `/ Term: Details`. + Term(TermItem), + /// A mathematical equation: `$x$`, `$ x^2 $`. + Equation(Equation), + /// The contents of a mathematical equation: `x^2 + 1`. + Math(Math), + /// An identifier in math: `pi`. + MathIdent(MathIdent), + /// An alignment point in math: `&`. + MathAlignPoint(MathAlignPoint), + /// Matched delimiters in math: `[x + y]`. + MathDelimited(MathDelimited), + /// A base with optional attachments in math: `a_1^2`. + MathAttach(MathAttach), + /// Grouped math primes + MathPrimes(MathPrimes), + /// A fraction in math: `x/2`. + MathFrac(MathFrac), + /// A root in math: `√x`, `∛x` or `∜x`. + MathRoot(MathRoot), + /// An identifier: `left`. + Ident(Ident), + /// The `none` literal. + None(None), + /// The `auto` literal. + Auto(Auto), + /// A boolean: `true`, `false`. + Bool(Bool), + /// An integer: `120`. + Int(Int), + /// A floating-point number: `1.2`, `10e-4`. + Float(Float), + /// A numeric value with a unit: `12pt`, `3cm`, `2em`, `90deg`, `50%`. + Numeric(Numeric), + /// A quoted string: `"..."`. + Str(Str), + /// A code block: `{ let x = 1; x + 2 }`. + Code(CodeBlock), + /// A content block: `[*Hi* there!]`. + Content(ContentBlock), + /// A grouped expression: `(1 + 2)`. + Parenthesized(Parenthesized), + /// An array: `(1, "hi", 12cm)`. + Array(Array), + /// A dictionary: `(thickness: 3pt, pattern: dashed)`. + Dict(Dict), + /// A unary operation: `-x`. + Unary(Unary), + /// A binary operation: `a + b`. + Binary(Binary), + /// A field access: `properties.age`. + FieldAccess(FieldAccess), + /// An invocation of a function or method: `f(x, y)`. + FuncCall(FuncCall), + /// A closure: `(x, y) => z`. + Closure(Closure), + /// A let binding: `let x = 1`. + Let(LetBinding), + //// A destructuring assignment: `(x, y) = (1, 2)`. + DestructAssign(DestructAssignment), + /// A set rule: `set text(...)`. + Set(SetRule), + /// A show rule: `show heading: it => emph(it.body)`. + Show(ShowRule), + /// An if-else conditional: `if x { y } else { z }`. + Conditional(Conditional), + /// A while loop: `while x { y }`. + While(WhileLoop), + /// A for loop: `for x in y { z }`. + For(ForLoop), + /// A module import: `import "utils.typ": a, b, c`. + Import(ModuleImport), + /// A module include: `include "chapter1.typ"`. + Include(ModuleInclude), + /// A break from a loop: `break`. + Break(LoopBreak), + /// A continue in a loop: `continue`. + Continue(LoopContinue), + /// A return from a function: `return`, `return x + 1`. + Return(FuncReturn), +} + +impl Expr { + fn cast_with_space(node: &SyntaxNode) -> Option<Self> { + match node.kind() { + SyntaxKind::Space => node.cast().map(Self::Space), + _ => Self::from_untyped(node), + } + } +} + +impl AstNode for Expr { + fn from_untyped(node: &SyntaxNode) -> Option<Self> { + match node.kind() { + SyntaxKind::Linebreak => node.cast().map(Self::Linebreak), + SyntaxKind::Parbreak => node.cast().map(Self::Parbreak), + SyntaxKind::Text => node.cast().map(Self::Text), + SyntaxKind::Escape => node.cast().map(Self::Escape), + SyntaxKind::Shorthand => node.cast().map(Self::Shorthand), + SyntaxKind::SmartQuote => node.cast().map(Self::SmartQuote), + SyntaxKind::Strong => node.cast().map(Self::Strong), + SyntaxKind::Emph => node.cast().map(Self::Emph), + SyntaxKind::Raw => node.cast().map(Self::Raw), + SyntaxKind::Link => node.cast().map(Self::Link), + SyntaxKind::Label => node.cast().map(Self::Label), + SyntaxKind::Ref => node.cast().map(Self::Ref), + SyntaxKind::Heading => node.cast().map(Self::Heading), + SyntaxKind::ListItem => node.cast().map(Self::List), + SyntaxKind::EnumItem => node.cast().map(Self::Enum), + SyntaxKind::TermItem => node.cast().map(Self::Term), + SyntaxKind::Equation => node.cast().map(Self::Equation), + SyntaxKind::Math => node.cast().map(Self::Math), + SyntaxKind::MathIdent => node.cast().map(Self::MathIdent), + SyntaxKind::MathAlignPoint => node.cast().map(Self::MathAlignPoint), + SyntaxKind::MathDelimited => node.cast().map(Self::MathDelimited), + SyntaxKind::MathAttach => node.cast().map(Self::MathAttach), + SyntaxKind::MathPrimes => node.cast().map(Self::MathPrimes), + SyntaxKind::MathFrac => node.cast().map(Self::MathFrac), + SyntaxKind::MathRoot => node.cast().map(Self::MathRoot), + SyntaxKind::Ident => node.cast().map(Self::Ident), + SyntaxKind::None => node.cast().map(Self::None), + SyntaxKind::Auto => node.cast().map(Self::Auto), + SyntaxKind::Bool => node.cast().map(Self::Bool), + SyntaxKind::Int => node.cast().map(Self::Int), + SyntaxKind::Float => node.cast().map(Self::Float), + SyntaxKind::Numeric => node.cast().map(Self::Numeric), + SyntaxKind::Str => node.cast().map(Self::Str), + SyntaxKind::CodeBlock => node.cast().map(Self::Code), + SyntaxKind::ContentBlock => node.cast().map(Self::Content), + SyntaxKind::Parenthesized => node.cast().map(Self::Parenthesized), + SyntaxKind::Array => node.cast().map(Self::Array), + SyntaxKind::Dict => node.cast().map(Self::Dict), + SyntaxKind::Unary => node.cast().map(Self::Unary), + SyntaxKind::Binary => node.cast().map(Self::Binary), + SyntaxKind::FieldAccess => node.cast().map(Self::FieldAccess), + SyntaxKind::FuncCall => node.cast().map(Self::FuncCall), + SyntaxKind::Closure => node.cast().map(Self::Closure), + SyntaxKind::LetBinding => node.cast().map(Self::Let), + SyntaxKind::DestructAssignment => node.cast().map(Self::DestructAssign), + SyntaxKind::SetRule => node.cast().map(Self::Set), + SyntaxKind::ShowRule => node.cast().map(Self::Show), + SyntaxKind::Conditional => node.cast().map(Self::Conditional), + SyntaxKind::WhileLoop => node.cast().map(Self::While), + SyntaxKind::ForLoop => node.cast().map(Self::For), + SyntaxKind::ModuleImport => node.cast().map(Self::Import), + SyntaxKind::ModuleInclude => node.cast().map(Self::Include), + SyntaxKind::LoopBreak => node.cast().map(Self::Break), + SyntaxKind::LoopContinue => node.cast().map(Self::Continue), + SyntaxKind::FuncReturn => node.cast().map(Self::Return), + _ => Option::None, + } + } + + fn as_untyped(&self) -> &SyntaxNode { + match self { + Self::Text(v) => v.as_untyped(), + Self::Space(v) => v.as_untyped(), + Self::Linebreak(v) => v.as_untyped(), + Self::Parbreak(v) => v.as_untyped(), + Self::Escape(v) => v.as_untyped(), + Self::Shorthand(v) => v.as_untyped(), + Self::SmartQuote(v) => v.as_untyped(), + Self::Strong(v) => v.as_untyped(), + Self::Emph(v) => v.as_untyped(), + Self::Raw(v) => v.as_untyped(), + Self::Link(v) => v.as_untyped(), + Self::Label(v) => v.as_untyped(), + Self::Ref(v) => v.as_untyped(), + Self::Heading(v) => v.as_untyped(), + Self::List(v) => v.as_untyped(), + Self::Enum(v) => v.as_untyped(), + Self::Term(v) => v.as_untyped(), + Self::Equation(v) => v.as_untyped(), + Self::Math(v) => v.as_untyped(), + Self::MathIdent(v) => v.as_untyped(), + Self::MathAlignPoint(v) => v.as_untyped(), + Self::MathDelimited(v) => v.as_untyped(), + Self::MathAttach(v) => v.as_untyped(), + Self::MathPrimes(v) => v.as_untyped(), + Self::MathFrac(v) => v.as_untyped(), + Self::MathRoot(v) => v.as_untyped(), + Self::Ident(v) => v.as_untyped(), + Self::None(v) => v.as_untyped(), + Self::Auto(v) => v.as_untyped(), + Self::Bool(v) => v.as_untyped(), + Self::Int(v) => v.as_untyped(), + Self::Float(v) => v.as_untyped(), + Self::Numeric(v) => v.as_untyped(), + Self::Str(v) => v.as_untyped(), + Self::Code(v) => v.as_untyped(), + Self::Content(v) => v.as_untyped(), + Self::Array(v) => v.as_untyped(), + Self::Dict(v) => v.as_untyped(), + Self::Parenthesized(v) => v.as_untyped(), + Self::Unary(v) => v.as_untyped(), + Self::Binary(v) => v.as_untyped(), + Self::FieldAccess(v) => v.as_untyped(), + Self::FuncCall(v) => v.as_untyped(), + Self::Closure(v) => v.as_untyped(), + Self::Let(v) => v.as_untyped(), + Self::DestructAssign(v) => v.as_untyped(), + Self::Set(v) => v.as_untyped(), + Self::Show(v) => v.as_untyped(), + Self::Conditional(v) => v.as_untyped(), + Self::While(v) => v.as_untyped(), + Self::For(v) => v.as_untyped(), + Self::Import(v) => v.as_untyped(), + Self::Include(v) => v.as_untyped(), + Self::Break(v) => v.as_untyped(), + Self::Continue(v) => v.as_untyped(), + Self::Return(v) => v.as_untyped(), + } + } +} + +impl Expr { + /// Can this expression be embedded into markup with a hashtag? + pub fn hashtag(&self) -> bool { + matches!( + self, + Self::Ident(_) + | Self::None(_) + | Self::Auto(_) + | Self::Bool(_) + | Self::Int(_) + | Self::Float(_) + | Self::Numeric(_) + | Self::Str(_) + | Self::Code(_) + | Self::Content(_) + | Self::Array(_) + | Self::Dict(_) + | Self::Parenthesized(_) + | Self::FieldAccess(_) + | Self::FuncCall(_) + | Self::Let(_) + | Self::Set(_) + | Self::Show(_) + | Self::Conditional(_) + | Self::While(_) + | Self::For(_) + | Self::Import(_) + | Self::Include(_) + | Self::Break(_) + | Self::Continue(_) + | Self::Return(_) + ) + } + + /// Is this a literal? + pub fn is_literal(&self) -> bool { + matches!( + self, + Self::None(_) + | Self::Auto(_) + | Self::Bool(_) + | Self::Int(_) + | Self::Float(_) + | Self::Numeric(_) + | Self::Str(_) + ) + } +} + +impl Default for Expr { + fn default() -> Self { + Expr::Space(Space::default()) + } +} + +node! { + /// Plain text without markup. + Text +} + +impl Text { + /// Get the text. + pub fn get(&self) -> &EcoString { + self.0.text() + } +} + +node! { + /// Whitespace in markup or math. Has at most one newline in markup, as more + /// indicate a paragraph break. + Space +} + +node! { + /// A forced line break: `\`. + Linebreak +} + +node! { + /// A paragraph break, indicated by one or multiple blank lines. + Parbreak +} + +node! { + /// An escape sequence: `\#`, `\u{1F5FA}`. + Escape +} + +impl Escape { + /// Get the escaped character. + pub fn get(&self) -> char { + let mut s = Scanner::new(self.0.text()); + s.expect('\\'); + if s.eat_if("u{") { + let hex = s.eat_while(char::is_ascii_hexdigit); + u32::from_str_radix(hex, 16) + .ok() + .and_then(std::char::from_u32) + .unwrap_or_default() + } else { + s.eat().unwrap_or_default() + } + } +} + +node! { + /// A shorthand for a unicode codepoint. For example, `~` for a non-breaking + /// space or `-?` for a soft hyphen. + Shorthand +} + +impl Shorthand { + /// A list of all shorthands. + pub const LIST: &[(&'static str, char)] = &[ + // Both. + ("...", '…'), + // Text only. + ("~", '\u{00A0}'), + ("--", '\u{2013}'), + ("---", '\u{2014}'), + ("-?", '\u{00AD}'), + // Math only. + ("-", '\u{2212}'), + ("'", '′'), + ("*", '∗'), + ("!=", '≠'), + (":=", '≔'), + ("::=", '⩴'), + ("=:", '≕'), + ("<<", '≪'), + ("<<<", '⋘'), + (">>", '≫'), + (">>>", '⋙'), + ("<=", '≤'), + (">=", '≥'), + ("->", '→'), + ("-->", '⟶'), + ("|->", '↦'), + (">->", '↣'), + ("->>", '↠'), + ("<-", '←'), + ("<--", '⟵'), + ("<-<", '↢'), + ("<<-", '↞'), + ("<->", '↔'), + ("<-->", '⟷'), + ("~>", '⇝'), + ("~~>", '⟿'), + ("<~", '⇜'), + ("<~~", '⬳'), + ("=>", '⇒'), + ("|=>", '⤇'), + ("==>", '⟹'), + ("<==", '⟸'), + ("<=>", '⇔'), + ("<==>", '⟺'), + ("[|", '⟦'), + ("|]", '⟧'), + ("||", '‖'), + ]; + + /// Get the shorthanded character. + pub fn get(&self) -> char { + let text = self.0.text(); + Self::LIST + .iter() + .find(|&&(s, _)| s == text) + .map_or_else(char::default, |&(_, c)| c) + } +} + +node! { + /// A smart quote: `'` or `"`. + SmartQuote +} + +impl SmartQuote { + /// Whether this is a double quote. + pub fn double(&self) -> bool { + self.0.text() == "\"" + } +} + +node! { + /// Strong content: `*Strong*`. + Strong +} + +impl Strong { + /// The contents of the strong node. + pub fn body(&self) -> Markup { + self.0.cast_first_match().unwrap_or_default() + } +} + +node! { + /// Emphasized content: `_Emphasized_`. + Emph +} + +impl Emph { + /// The contents of the emphasis node. + pub fn body(&self) -> Markup { + self.0.cast_first_match().unwrap_or_default() + } +} + +node! { + /// Raw text with optional syntax highlighting: `` `...` ``. + Raw +} + +impl Raw { + /// The trimmed raw text. + pub fn text(&self) -> EcoString { + let mut text = self.0.text().as_str(); + let blocky = text.starts_with("```"); + text = text.trim_matches('`'); + + // Trim tag, one space at the start, and one space at the end if the + // last non-whitespace char is a backtick. + if blocky { + let mut s = Scanner::new(text); + if s.eat_if(is_id_start) { + s.eat_while(is_id_continue); + } + text = s.after(); + text = text.strip_prefix(' ').unwrap_or(text); + if text.trim_end().ends_with('`') { + text = text.strip_suffix(' ').unwrap_or(text); + } + } + + // Split into lines. + let mut lines = split_newlines(text); + + if blocky { + let dedent = lines + .iter() + .skip(1) + .filter(|line| !line.chars().all(char::is_whitespace)) + // The line with the closing ``` is always taken into account + .chain(lines.last()) + .map(|line| line.chars().take_while(|c| c.is_whitespace()).count()) + .min() + .unwrap_or(0); + + // Dedent based on column, but not for the first line. + for line in lines.iter_mut().skip(1) { + let offset = line.chars().take(dedent).map(char::len_utf8).sum(); + *line = &line[offset..]; + } + + let is_whitespace = |line: &&str| line.chars().all(char::is_whitespace); + + // Trims a sequence of whitespace followed by a newline at the start. + if lines.first().map_or(false, is_whitespace) { + lines.remove(0); + } + + // Trims a newline followed by a sequence of whitespace at the end. + if lines.last().map_or(false, is_whitespace) { + lines.pop(); + } + } + + lines.join("\n").into() + } + + /// An optional identifier specifying the language to syntax-highlight in. + pub fn lang(&self) -> Option<&str> { + let text = self.0.text(); + + // Only blocky literals are supposed to contain a language. + if !text.starts_with("```") { + return Option::None; + } + + let inner = text.trim_start_matches('`'); + let mut s = Scanner::new(inner); + s.eat_if(is_id_start).then(|| { + s.eat_while(is_id_continue); + s.before() + }) + } + + /// Whether the raw text should be displayed in a separate block. + pub fn block(&self) -> bool { + let text = self.0.text(); + text.starts_with("```") && text.chars().any(is_newline) + } +} + +node! { + /// A hyperlink: `https://typst.org`. + Link +} + +impl Link { + /// Get the URL. + pub fn get(&self) -> &EcoString { + self.0.text() + } +} + +node! { + /// A label: `<intro>`. + Label +} + +impl Label { + /// Get the label's text. + pub fn get(&self) -> &str { + self.0.text().trim_start_matches('<').trim_end_matches('>') + } +} + +node! { + /// A reference: `@target`, `@target[..]`. + Ref +} + +impl Ref { + /// Get the target. + pub fn target(&self) -> &str { + self.0 + .children() + .find(|node| node.kind() == SyntaxKind::RefMarker) + .map(|node| node.text().trim_start_matches('@')) + .unwrap_or_default() + } + + /// Get the supplement. + pub fn supplement(&self) -> Option<ContentBlock> { + self.0.cast_last_match() + } +} + +node! { + /// A section heading: `= Introduction`. + Heading +} + +impl Heading { + /// The contents of the heading. + pub fn body(&self) -> Markup { + self.0.cast_first_match().unwrap_or_default() + } + + /// The section depth (number of equals signs). + pub fn level(&self) -> NonZeroUsize { + self.0 + .children() + .find(|node| node.kind() == SyntaxKind::HeadingMarker) + .and_then(|node| node.len().try_into().ok()) + .unwrap_or(NonZeroUsize::new(1).unwrap()) + } +} + +node! { + /// An item in a bullet list: `- ...`. + ListItem +} + +impl ListItem { + /// The contents of the list item. + pub fn body(&self) -> Markup { + self.0.cast_first_match().unwrap_or_default() + } +} + +node! { + /// An item in an enumeration (numbered list): `+ ...` or `1. ...`. + EnumItem +} + +impl EnumItem { + /// The explicit numbering, if any: `23.`. + pub fn number(&self) -> Option<usize> { + self.0.children().find_map(|node| match node.kind() { + SyntaxKind::EnumMarker => node.text().trim_end_matches('.').parse().ok(), + _ => Option::None, + }) + } + + /// The contents of the list item. + pub fn body(&self) -> Markup { + self.0.cast_first_match().unwrap_or_default() + } +} + +node! { + /// An item in a term list: `/ Term: Details`. + TermItem +} + +impl TermItem { + /// The term described by the item. + pub fn term(&self) -> Markup { + self.0.cast_first_match().unwrap_or_default() + } + + /// The description of the term. + pub fn description(&self) -> Markup { + self.0.cast_last_match().unwrap_or_default() + } +} + +node! { + /// A mathemathical equation: `$x$`, `$ x^2 $`. + Equation +} + +impl Equation { + /// The contained math. + pub fn body(&self) -> Math { + self.0.cast_first_match().unwrap_or_default() + } + + /// Whether the equation should be displayed as a separate block. + pub fn block(&self) -> bool { + let is_space = |node: Option<&SyntaxNode>| { + node.map(SyntaxNode::kind) == Some(SyntaxKind::Space) + }; + is_space(self.0.children().nth(1)) && is_space(self.0.children().nth_back(1)) + } +} + +node! { + /// The contents of a mathematical equation: `x^2 + 1`. + Math +} + +impl Math { + /// The expressions the mathematical content consists of. + pub fn exprs(&self) -> impl DoubleEndedIterator<Item = Expr> + '_ { + self.0.children().filter_map(Expr::cast_with_space) + } +} + +node! { + /// An identifier in math: `pi`. + MathIdent +} + +impl MathIdent { + /// Get the identifier. + pub fn get(&self) -> &EcoString { + self.0.text() + } + + /// Take out the contained identifier. + pub fn take(self) -> EcoString { + self.0.into_text() + } + + /// Get the identifier as a string slice. + pub fn as_str(&self) -> &str { + self.get() + } +} + +impl Deref for MathIdent { + type Target = str; + + fn deref(&self) -> &Self::Target { + self.as_str() + } +} + +node! { + /// An alignment point in math: `&`. + MathAlignPoint +} + +node! { + /// Matched delimiters in math: `[x + y]`. + MathDelimited +} + +impl MathDelimited { + /// The opening delimiter. + pub fn open(&self) -> Expr { + self.0.cast_first_match().unwrap_or_default() + } + + /// The contents, including the delimiters. + pub fn body(&self) -> Math { + self.0.cast_first_match().unwrap_or_default() + } + + /// The closing delimiter. + pub fn close(&self) -> Expr { + self.0.cast_last_match().unwrap_or_default() + } +} + +node! { + /// A base with optional attachments in math: `a_1^2`. + MathAttach +} + +impl MathAttach { + /// The base, to which things are attached. + pub fn base(&self) -> Expr { + self.0.cast_first_match().unwrap_or_default() + } + + /// The bottom attachment. + pub fn bottom(&self) -> Option<Expr> { + self.0 + .children() + .skip_while(|node| !matches!(node.kind(), SyntaxKind::Underscore)) + .find_map(SyntaxNode::cast) + } + + /// The top attachment. + pub fn top(&self) -> Option<Expr> { + self.0 + .children() + .skip_while(|node| !matches!(node.kind(), SyntaxKind::Hat)) + .find_map(SyntaxNode::cast) + } + + /// Extract primes if present. + pub fn primes(&self) -> Option<MathPrimes> { + self.0.cast_first_match() + } +} + +node! { + /// Grouped primes in math: `a'''`. + MathPrimes +} + +impl MathPrimes { + pub fn count(&self) -> usize { + self.0 + .children() + .filter(|node| matches!(node.kind(), SyntaxKind::Prime)) + .count() + } +} + +node! { + /// A fraction in math: `x/2` + MathFrac +} + +impl MathFrac { + /// The numerator. + pub fn num(&self) -> Expr { + self.0.cast_first_match().unwrap_or_default() + } + + /// The denominator. + pub fn denom(&self) -> Expr { + self.0.cast_last_match().unwrap_or_default() + } +} + +node! { + /// A root in math: `√x`, `∛x` or `∜x`. + MathRoot +} + +impl MathRoot { + /// The index of the root. + pub fn index(&self) -> Option<usize> { + match self.0.children().next().map(|node| node.text().as_str()) { + Some("∜") => Some(4), + Some("∛") => Some(3), + Some("√") => Option::None, + _ => Option::None, + } + } + + /// The radicand. + pub fn radicand(&self) -> Expr { + self.0.cast_first_match().unwrap_or_default() + } +} + +node! { + /// An identifier: `it`. + Ident +} + +impl Ident { + /// Get the identifier. + pub fn get(&self) -> &EcoString { + self.0.text() + } + + /// Take out the contained identifier. + pub fn take(self) -> EcoString { + self.0.into_text() + } + + /// Get the identifier as a string slice. + pub fn as_str(&self) -> &str { + self.get() + } +} + +impl Deref for Ident { + type Target = str; + + fn deref(&self) -> &Self::Target { + self.as_str() + } +} + +node! { + /// The `none` literal. + None +} + +node! { + /// The `auto` literal. + Auto +} + +node! { + /// A boolean: `true`, `false`. + Bool +} + +impl Bool { + /// Get the boolean value. + pub fn get(&self) -> bool { + self.0.text() == "true" + } +} + +node! { + /// An integer: `120`. + Int +} + +impl Int { + /// Get the integer value. + pub fn get(&self) -> i64 { + let text = self.0.text(); + if let Some(rest) = text.strip_prefix("0x") { + i64::from_str_radix(rest, 16) + } else if let Some(rest) = text.strip_prefix("0o") { + i64::from_str_radix(rest, 8) + } else if let Some(rest) = text.strip_prefix("0b") { + i64::from_str_radix(rest, 2) + } else { + text.parse() + } + .unwrap_or_default() + } +} + +node! { + /// A floating-point number: `1.2`, `10e-4`. + Float +} + +impl Float { + /// Get the floating-point value. + pub fn get(&self) -> f64 { + self.0.text().parse().unwrap_or_default() + } +} + +node! { + /// A numeric value with a unit: `12pt`, `3cm`, `2em`, `90deg`, `50%`. + Numeric +} + +impl Numeric { + /// Get the numeric value and unit. + pub fn get(&self) -> (f64, Unit) { + let text = self.0.text(); + let count = text + .chars() + .rev() + .take_while(|c| matches!(c, 'a'..='z' | '%')) + .count(); + + let split = text.len() - count; + let value = text[..split].parse().unwrap_or_default(); + let unit = match &text[split..] { + "pt" => Unit::Pt, + "mm" => Unit::Mm, + "cm" => Unit::Cm, + "in" => Unit::In, + "deg" => Unit::Deg, + "rad" => Unit::Rad, + "em" => Unit::Em, + "fr" => Unit::Fr, + "%" => Unit::Percent, + _ => Unit::Percent, + }; + + (value, unit) + } +} + +/// Unit of a numeric value. +#[derive(Debug, Copy, Clone, Eq, PartialEq, Hash)] +pub enum Unit { + /// Points. + Pt, + /// Millimeters. + Mm, + /// Centimeters. + Cm, + /// Inches. + In, + /// Radians. + Rad, + /// Degrees. + Deg, + /// Font-relative: `1em` is the same as the font size. + Em, + /// Fractions: `fr`. + Fr, + /// Percentage: `%`. + Percent, +} + +node! { + /// A quoted string: `"..."`. + Str +} + +impl Str { + /// Get the string value with resolved escape sequences. + pub fn get(&self) -> EcoString { + let text = self.0.text(); + let unquoted = &text[1..text.len() - 1]; + if !unquoted.contains('\\') { + return unquoted.into(); + } + + let mut out = EcoString::with_capacity(unquoted.len()); + let mut s = Scanner::new(unquoted); + + while let Some(c) = s.eat() { + if c != '\\' { + out.push(c); + continue; + } + + let start = s.locate(-1); + match s.eat() { + Some('\\') => out.push('\\'), + Some('"') => out.push('"'), + Some('n') => out.push('\n'), + Some('r') => out.push('\r'), + Some('t') => out.push('\t'), + Some('u') if s.eat_if('{') => { + let sequence = s.eat_while(char::is_ascii_hexdigit); + s.eat_if('}'); + + match u32::from_str_radix(sequence, 16) + .ok() + .and_then(std::char::from_u32) + { + Some(c) => out.push(c), + Option::None => out.push_str(s.from(start)), + } + } + _ => out.push_str(s.from(start)), + } + } + + out + } +} + +node! { + /// A code block: `{ let x = 1; x + 2 }`. + CodeBlock +} + +impl CodeBlock { + /// The contained code. + pub fn body(&self) -> Code { + self.0.cast_first_match().unwrap_or_default() + } +} + +node! { + /// Code. + Code +} + +impl Code { + /// The list of expressions contained in the code. + pub fn exprs(&self) -> impl DoubleEndedIterator<Item = Expr> + '_ { + self.0.children().filter_map(SyntaxNode::cast) + } +} + +node! { + /// A content block: `[*Hi* there!]`. + ContentBlock +} + +impl ContentBlock { + /// The contained markup. + pub fn body(&self) -> Markup { + self.0.cast_first_match().unwrap_or_default() + } +} + +node! { + /// A grouped expression: `(1 + 2)`. + Parenthesized +} + +impl Parenthesized { + /// The wrapped expression. + pub fn expr(&self) -> Expr { + self.0.cast_first_match().unwrap_or_default() + } +} + +node! { + /// An array: `(1, "hi", 12cm)`. + Array +} + +impl Array { + /// The array's items. + pub fn items(&self) -> impl DoubleEndedIterator<Item = ArrayItem> + '_ { + self.0.children().filter_map(SyntaxNode::cast) + } +} + +/// An item in an array. +#[derive(Debug, Clone, Hash)] +pub enum ArrayItem { + /// A bare expression: `12`. + Pos(Expr), + /// A spread expression: `..things`. + Spread(Expr), +} + +impl AstNode for ArrayItem { + fn from_untyped(node: &SyntaxNode) -> Option<Self> { + match node.kind() { + SyntaxKind::Spread => node.cast_first_match().map(Self::Spread), + _ => node.cast().map(Self::Pos), + } + } + + fn as_untyped(&self) -> &SyntaxNode { + match self { + Self::Pos(v) => v.as_untyped(), + Self::Spread(v) => v.as_untyped(), + } + } +} + +node! { + /// A dictionary: `(thickness: 3pt, pattern: dashed)`. + Dict +} + +impl Dict { + /// The dictionary's items. + pub fn items(&self) -> impl DoubleEndedIterator<Item = DictItem> + '_ { + self.0.children().filter_map(SyntaxNode::cast) + } +} + +/// An item in an dictionary expression. +#[derive(Debug, Clone, Hash)] +pub enum DictItem { + /// A named pair: `thickness: 3pt`. + Named(Named), + /// A keyed pair: `"spacy key": true`. + Keyed(Keyed), + /// A spread expression: `..things`. + Spread(Expr), +} + +impl AstNode for DictItem { + fn from_untyped(node: &SyntaxNode) -> Option<Self> { + match node.kind() { + SyntaxKind::Named => node.cast().map(Self::Named), + SyntaxKind::Keyed => node.cast().map(Self::Keyed), + SyntaxKind::Spread => node.cast_first_match().map(Self::Spread), + _ => Option::None, + } + } + + fn as_untyped(&self) -> &SyntaxNode { + match self { + Self::Named(v) => v.as_untyped(), + Self::Keyed(v) => v.as_untyped(), + Self::Spread(v) => v.as_untyped(), + } + } +} + +node! { + /// A named pair: `thickness: 3pt`. + Named +} + +impl Named { + /// The name: `thickness`. + pub fn name(&self) -> Ident { + self.0.cast_first_match().unwrap_or_default() + } + + /// The right-hand side of the pair: `3pt`. + pub fn expr(&self) -> Expr { + self.0.cast_last_match().unwrap_or_default() + } + + /// The right-hand side of the pair as an identifier. + pub fn expr_ident(&self) -> Option<Ident> { + self.0.cast_last_match() + } +} + +node! { + /// A keyed pair: `"spacy key": true`. + Keyed +} + +impl Keyed { + /// The key: `"spacy key"`. + pub fn key(&self) -> Str { + self.0 + .children() + .find_map(|node| node.cast::<Str>()) + .unwrap_or_default() + } + + /// The right-hand side of the pair: `true`. + pub fn expr(&self) -> Expr { + self.0.cast_last_match().unwrap_or_default() + } +} + +node! { + /// A unary operation: `-x`. + Unary +} + +impl Unary { + /// The operator: `-`. + pub fn op(&self) -> UnOp { + self.0 + .children() + .find_map(|node| UnOp::from_kind(node.kind())) + .unwrap_or(UnOp::Pos) + } + + /// The expression to operate on: `x`. + pub fn expr(&self) -> Expr { + self.0.cast_last_match().unwrap_or_default() + } +} + +/// A unary operator. +#[derive(Debug, Copy, Clone, Eq, PartialEq, Hash)] +pub enum UnOp { + /// The plus operator: `+`. + Pos, + /// The negation operator: `-`. + Neg, + /// The boolean `not`. + Not, +} + +impl UnOp { + /// Try to convert the token into a unary operation. + pub fn from_kind(token: SyntaxKind) -> Option<Self> { + Some(match token { + SyntaxKind::Plus => Self::Pos, + SyntaxKind::Minus => Self::Neg, + SyntaxKind::Not => Self::Not, + _ => return Option::None, + }) + } + + /// The precedence of this operator. + pub fn precedence(self) -> usize { + match self { + Self::Pos | Self::Neg => 7, + Self::Not => 4, + } + } + + /// The string representation of this operation. + pub fn as_str(self) -> &'static str { + match self { + Self::Pos => "+", + Self::Neg => "-", + Self::Not => "not", + } + } +} + +node! { + /// A binary operation: `a + b`. + Binary +} + +impl Binary { + /// The binary operator: `+`. + pub fn op(&self) -> BinOp { + let mut not = false; + self.0 + .children() + .find_map(|node| match node.kind() { + SyntaxKind::Not => { + not = true; + Option::None + } + SyntaxKind::In if not => Some(BinOp::NotIn), + _ => BinOp::from_kind(node.kind()), + }) + .unwrap_or(BinOp::Add) + } + + /// The left-hand side of the operation: `a`. + pub fn lhs(&self) -> Expr { + self.0.cast_first_match().unwrap_or_default() + } + + /// The right-hand side of the operation: `b`. + pub fn rhs(&self) -> Expr { + self.0.cast_last_match().unwrap_or_default() + } +} + +/// A binary operator. +#[derive(Debug, Copy, Clone, Eq, PartialEq, Hash)] +pub enum BinOp { + /// The addition operator: `+`. + Add, + /// The subtraction operator: `-`. + Sub, + /// The multiplication operator: `*`. + Mul, + /// The division operator: `/`. + Div, + /// The short-circuiting boolean `and`. + And, + /// The short-circuiting boolean `or`. + Or, + /// The equality operator: `==`. + Eq, + /// The inequality operator: `!=`. + Neq, + /// The less-than operator: `<`. + Lt, + /// The less-than or equal operator: `<=`. + Leq, + /// The greater-than operator: `>`. + Gt, + /// The greater-than or equal operator: `>=`. + Geq, + /// The assignment operator: `=`. + Assign, + /// The containment operator: `in`. + In, + /// The inversed containment operator: `not in`. + NotIn, + /// The add-assign operator: `+=`. + AddAssign, + /// The subtract-assign oeprator: `-=`. + SubAssign, + /// The multiply-assign operator: `*=`. + MulAssign, + /// The divide-assign operator: `/=`. + DivAssign, +} + +impl BinOp { + /// Try to convert the token into a binary operation. + pub fn from_kind(token: SyntaxKind) -> Option<Self> { + Some(match token { + SyntaxKind::Plus => Self::Add, + SyntaxKind::Minus => Self::Sub, + SyntaxKind::Star => Self::Mul, + SyntaxKind::Slash => Self::Div, + SyntaxKind::And => Self::And, + SyntaxKind::Or => Self::Or, + SyntaxKind::EqEq => Self::Eq, + SyntaxKind::ExclEq => Self::Neq, + SyntaxKind::Lt => Self::Lt, + SyntaxKind::LtEq => Self::Leq, + SyntaxKind::Gt => Self::Gt, + SyntaxKind::GtEq => Self::Geq, + SyntaxKind::Eq => Self::Assign, + SyntaxKind::In => Self::In, + SyntaxKind::PlusEq => Self::AddAssign, + SyntaxKind::HyphEq => Self::SubAssign, + SyntaxKind::StarEq => Self::MulAssign, + SyntaxKind::SlashEq => Self::DivAssign, + _ => return Option::None, + }) + } + + /// The precedence of this operator. + pub fn precedence(self) -> usize { + match self { + Self::Mul => 6, + Self::Div => 6, + Self::Add => 5, + Self::Sub => 5, + Self::Eq => 4, + Self::Neq => 4, + Self::Lt => 4, + Self::Leq => 4, + Self::Gt => 4, + Self::Geq => 4, + Self::In => 4, + Self::NotIn => 4, + Self::And => 3, + Self::Or => 2, + Self::Assign => 1, + Self::AddAssign => 1, + Self::SubAssign => 1, + Self::MulAssign => 1, + Self::DivAssign => 1, + } + } + + /// The associativity of this operator. + pub fn assoc(self) -> Assoc { + match self { + Self::Add => Assoc::Left, + Self::Sub => Assoc::Left, + Self::Mul => Assoc::Left, + Self::Div => Assoc::Left, + Self::And => Assoc::Left, + Self::Or => Assoc::Left, + Self::Eq => Assoc::Left, + Self::Neq => Assoc::Left, + Self::Lt => Assoc::Left, + Self::Leq => Assoc::Left, + Self::Gt => Assoc::Left, + Self::Geq => Assoc::Left, + Self::In => Assoc::Left, + Self::NotIn => Assoc::Left, + Self::Assign => Assoc::Right, + Self::AddAssign => Assoc::Right, + Self::SubAssign => Assoc::Right, + Self::MulAssign => Assoc::Right, + Self::DivAssign => Assoc::Right, + } + } + + /// The string representation of this operation. + pub fn as_str(self) -> &'static str { + match self { + Self::Add => "+", + Self::Sub => "-", + Self::Mul => "*", + Self::Div => "/", + Self::And => "and", + Self::Or => "or", + Self::Eq => "==", + Self::Neq => "!=", + Self::Lt => "<", + Self::Leq => "<=", + Self::Gt => ">", + Self::Geq => ">=", + Self::In => "in", + Self::NotIn => "not in", + Self::Assign => "=", + Self::AddAssign => "+=", + Self::SubAssign => "-=", + Self::MulAssign => "*=", + Self::DivAssign => "/=", + } + } +} + +/// The associativity of a binary operator. +#[derive(Debug, Copy, Clone, Eq, PartialEq, Hash)] +pub enum Assoc { + /// Left-associative: `a + b + c` is equivalent to `(a + b) + c`. + Left, + /// Right-associative: `a = b = c` is equivalent to `a = (b = c)`. + Right, +} + +node! { + /// A field access: `properties.age`. + FieldAccess +} + +impl FieldAccess { + /// The expression to access the field on. + pub fn target(&self) -> Expr { + self.0.cast_first_match().unwrap_or_default() + } + + /// The name of the field. + pub fn field(&self) -> Ident { + self.0.cast_last_match().unwrap_or_default() + } +} + +node! { + /// An invocation of a function or method: `f(x, y)`. + FuncCall +} + +impl FuncCall { + /// The function to call. + pub fn callee(&self) -> Expr { + self.0.cast_first_match().unwrap_or_default() + } + + /// The arguments to the function. + pub fn args(&self) -> Args { + self.0.cast_last_match().unwrap_or_default() + } +} + +node! { + /// A function call's argument list: `(12pt, y)`. + Args +} + +impl Args { + /// The positional and named arguments. + pub fn items(&self) -> impl DoubleEndedIterator<Item = Arg> + '_ { + self.0.children().filter_map(SyntaxNode::cast) + } +} + +/// An argument to a function call. +#[derive(Debug, Clone, Hash)] +pub enum Arg { + /// A positional argument: `12`. + Pos(Expr), + /// A named argument: `draw: false`. + Named(Named), + /// A spread argument: `..things`. + Spread(Expr), +} + +impl AstNode for Arg { + fn from_untyped(node: &SyntaxNode) -> Option<Self> { + match node.kind() { + SyntaxKind::Named => node.cast().map(Self::Named), + SyntaxKind::Spread => node.cast_first_match().map(Self::Spread), + _ => node.cast().map(Self::Pos), + } + } + + fn as_untyped(&self) -> &SyntaxNode { + match self { + Self::Pos(v) => v.as_untyped(), + Self::Named(v) => v.as_untyped(), + Self::Spread(v) => v.as_untyped(), + } + } +} + +node! { + /// A closure: `(x, y) => z`. + Closure +} + +impl Closure { + /// The name of the closure. + /// + /// This only exists if you use the function syntax sugar: `let f(x) = y`. + pub fn name(&self) -> Option<Ident> { + self.0.children().next()?.cast() + } + + /// The parameter bindings. + pub fn params(&self) -> Params { + self.0.cast_first_match().unwrap_or_default() + } + + /// The body of the closure. + pub fn body(&self) -> Expr { + self.0.cast_last_match().unwrap_or_default() + } +} + +node! { + /// A closure's parameters: `(x, y)`. + Params +} + +impl Params { + /// The parameter bindings. + pub fn children(&self) -> impl DoubleEndedIterator<Item = Param> + '_ { + self.0.children().filter_map(SyntaxNode::cast) + } +} + +node! { + /// A spread: `..x` or `..x.at(0)`. + Spread +} + +impl Spread { + /// Try to get an identifier. + pub fn name(&self) -> Option<Ident> { + self.0.cast_first_match() + } + + /// Try to get an expression. + pub fn expr(&self) -> Option<Expr> { + self.0.cast_first_match() + } +} + +node! { + /// An underscore: `_` + Underscore +} + +/// A parameter to a closure. +#[derive(Debug, Clone, Hash)] +pub enum Param { + /// A positional parameter: `x`. + Pos(Pattern), + /// A named parameter with a default value: `draw: false`. + Named(Named), + /// An argument sink: `..args`. + Sink(Spread), +} + +impl AstNode for Param { + fn from_untyped(node: &SyntaxNode) -> Option<Self> { + match node.kind() { + SyntaxKind::Named => node.cast().map(Self::Named), + SyntaxKind::Spread => node.cast().map(Self::Sink), + _ => node.cast().map(Self::Pos), + } + } + + fn as_untyped(&self) -> &SyntaxNode { + match self { + Self::Pos(v) => v.as_untyped(), + Self::Named(v) => v.as_untyped(), + Self::Sink(v) => v.as_untyped(), + } + } +} + +node! { + /// A destructuring pattern: `x` or `(x, _, ..y)`. + Destructuring +} + +impl Destructuring { + /// The bindings of the destructuring. + pub fn bindings(&self) -> impl Iterator<Item = DestructuringKind> + '_ { + self.0.children().filter_map(SyntaxNode::cast) + } + + // Returns a list of all identifiers in the pattern. + pub fn idents(&self) -> impl Iterator<Item = Ident> + '_ { + self.bindings().filter_map(|binding| match binding { + DestructuringKind::Normal(Expr::Ident(ident)) => Some(ident), + DestructuringKind::Sink(spread) => spread.name(), + DestructuringKind::Named(named) => named.expr_ident(), + _ => Option::None, + }) + } +} + +/// The kind of an element in a destructuring pattern. +#[derive(Debug, Clone, Hash)] +pub enum DestructuringKind { + /// An expression: `x`. + Normal(Expr), + /// An argument sink: `..y`. + Sink(Spread), + /// Named arguments: `x: 1`. + Named(Named), + /// A placeholder: `_`. + Placeholder(Underscore), +} + +impl AstNode for DestructuringKind { + fn from_untyped(node: &SyntaxNode) -> Option<Self> { + match node.kind() { + SyntaxKind::Named => node.cast().map(Self::Named), + SyntaxKind::Spread => node.cast().map(Self::Sink), + SyntaxKind::Underscore => node.cast().map(Self::Placeholder), + _ => node.cast().map(Self::Normal), + } + } + + fn as_untyped(&self) -> &SyntaxNode { + match self { + Self::Normal(v) => v.as_untyped(), + Self::Named(v) => v.as_untyped(), + Self::Sink(v) => v.as_untyped(), + Self::Placeholder(v) => v.as_untyped(), + } + } +} + +/// The kind of a pattern. +#[derive(Debug, Clone, Hash)] +pub enum Pattern { + /// A single expression: `x`. + Normal(Expr), + /// A placeholder: `_`. + Placeholder(Underscore), + /// A destructuring pattern: `(x, _, ..y)`. + Destructuring(Destructuring), +} + +impl AstNode for Pattern { + fn from_untyped(node: &SyntaxNode) -> Option<Self> { + match node.kind() { + SyntaxKind::Destructuring => node.cast().map(Self::Destructuring), + SyntaxKind::Underscore => node.cast().map(Self::Placeholder), + _ => node.cast().map(Self::Normal), + } + } + + fn as_untyped(&self) -> &SyntaxNode { + match self { + Self::Normal(v) => v.as_untyped(), + Self::Destructuring(v) => v.as_untyped(), + Self::Placeholder(v) => v.as_untyped(), + } + } +} + +impl Pattern { + // Returns a list of all identifiers in the pattern. + pub fn idents(&self) -> Vec<Ident> { + match self { + Pattern::Normal(Expr::Ident(ident)) => vec![ident.clone()], + Pattern::Destructuring(destruct) => destruct.idents().collect(), + _ => vec![], + } + } +} + +impl Default for Pattern { + fn default() -> Self { + Self::Normal(Expr::default()) + } +} + +node! { + /// A let binding: `let x = 1`. + LetBinding +} + +#[derive(Debug)] +pub enum LetBindingKind { + /// A normal binding: `let x = 1`. + Normal(Pattern), + /// A closure binding: `let f(x) = 1`. + Closure(Ident), +} + +impl LetBindingKind { + // Returns a list of all identifiers in the pattern. + pub fn idents(&self) -> Vec<Ident> { + match self { + LetBindingKind::Normal(pattern) => pattern.idents(), + LetBindingKind::Closure(ident) => { + vec![ident.clone()] + } + } + } +} + +impl LetBinding { + /// The kind of the let binding. + pub fn kind(&self) -> LetBindingKind { + match self.0.cast_first_match::<Pattern>() { + Some(Pattern::Normal(Expr::Closure(closure))) => { + LetBindingKind::Closure(closure.name().unwrap_or_default()) + } + pattern => LetBindingKind::Normal(pattern.unwrap_or_default()), + } + } + + /// The expression the binding is initialized with. + pub fn init(&self) -> Option<Expr> { + match self.kind() { + LetBindingKind::Normal(Pattern::Normal(_)) => { + self.0.children().filter_map(SyntaxNode::cast).nth(1) + } + LetBindingKind::Normal(_) => self.0.cast_first_match(), + LetBindingKind::Closure(_) => self.0.cast_first_match(), + } + } +} + +node! { + /// An assignment expression `(x, y) = (1, 2)`. + DestructAssignment +} + +impl DestructAssignment { + /// The pattern of the assignment. + pub fn pattern(&self) -> Pattern { + self.0.cast_first_match::<Pattern>().unwrap_or_default() + } + + /// The expression that is assigned. + pub fn value(&self) -> Expr { + self.0.cast_last_match().unwrap_or_default() + } +} + +node! { + /// A set rule: `set text(...)`. + SetRule +} + +impl SetRule { + /// The function to set style properties for. + pub fn target(&self) -> Expr { + self.0.cast_first_match().unwrap_or_default() + } + + /// The style properties to set. + pub fn args(&self) -> Args { + self.0.cast_last_match().unwrap_or_default() + } + + /// A condition under which the set rule applies. + pub fn condition(&self) -> Option<Expr> { + self.0 + .children() + .skip_while(|child| child.kind() != SyntaxKind::If) + .find_map(SyntaxNode::cast) + } +} + +node! { + /// A show rule: `show heading: it => emph(it.body)`. + ShowRule +} + +impl ShowRule { + /// Defines which nodes the show rule applies to. + pub fn selector(&self) -> Option<Expr> { + self.0 + .children() + .rev() + .skip_while(|child| child.kind() != SyntaxKind::Colon) + .find_map(SyntaxNode::cast) + } + + /// The transformation recipe. + pub fn transform(&self) -> Expr { + self.0.cast_last_match().unwrap_or_default() + } +} + +node! { + /// An if-else conditional: `if x { y } else { z }`. + Conditional +} + +impl Conditional { + /// The condition which selects the body to evaluate. + pub fn condition(&self) -> Expr { + self.0.cast_first_match().unwrap_or_default() + } + + /// The expression to evaluate if the condition is true. + pub fn if_body(&self) -> Expr { + self.0 + .children() + .filter_map(SyntaxNode::cast) + .nth(1) + .unwrap_or_default() + } + + /// The expression to evaluate if the condition is false. + pub fn else_body(&self) -> Option<Expr> { + self.0.children().filter_map(SyntaxNode::cast).nth(2) + } +} + +node! { + /// A while loop: `while x { y }`. + WhileLoop +} + +impl WhileLoop { + /// The condition which selects whether to evaluate the body. + pub fn condition(&self) -> Expr { + self.0.cast_first_match().unwrap_or_default() + } + + /// The expression to evaluate while the condition is true. + pub fn body(&self) -> Expr { + self.0.cast_last_match().unwrap_or_default() + } +} + +node! { + /// A for loop: `for x in y { z }`. + ForLoop +} + +impl ForLoop { + /// The pattern to assign to. + pub fn pattern(&self) -> Pattern { + self.0.cast_first_match().unwrap_or_default() + } + + /// The expression to iterate over. + pub fn iter(&self) -> Expr { + self.0 + .children() + .skip_while(|&c| c.kind() != SyntaxKind::In) + .find_map(SyntaxNode::cast) + .unwrap_or_default() + } + + /// The expression to evaluate for each iteration. + pub fn body(&self) -> Expr { + self.0.cast_last_match().unwrap_or_default() + } +} + +node! { + /// A module import: `import "utils.typ": a, b, c`. + ModuleImport +} + +impl ModuleImport { + /// The module or path from which the items should be imported. + pub fn source(&self) -> Expr { + self.0.cast_first_match().unwrap_or_default() + } + + /// The items to be imported. + pub fn imports(&self) -> Option<Imports> { + self.0.children().find_map(|node| match node.kind() { + SyntaxKind::Star => Some(Imports::Wildcard), + SyntaxKind::ImportItems => { + let items = node.children().filter_map(SyntaxNode::cast).collect(); + Some(Imports::Items(items)) + } + _ => Option::None, + }) + } +} + +/// The items that ought to be imported from a file. +#[derive(Debug, Clone, Hash)] +pub enum Imports { + /// All items in the scope of the file should be imported. + Wildcard, + /// The specified items from the file should be imported. + Items(Vec<Ident>), +} + +node! { + /// A module include: `include "chapter1.typ"`. + ModuleInclude +} + +impl ModuleInclude { + /// The module or path from which the content should be included. + pub fn source(&self) -> Expr { + self.0.cast_last_match().unwrap_or_default() + } +} + +node! { + /// A break from a loop: `break`. + LoopBreak +} + +node! { + /// A continue in a loop: `continue`. + LoopContinue +} + +node! { + /// A return from a function: `return`, `return x + 1`. + FuncReturn +} + +impl FuncReturn { + /// The expression to return. + pub fn body(&self) -> Option<Expr> { + self.0.cast_last_match() + } +} diff --git a/crates/typst-syntax/src/file.rs b/crates/typst-syntax/src/file.rs new file mode 100644 index 00000000..fc1bed21 --- /dev/null +++ b/crates/typst-syntax/src/file.rs @@ -0,0 +1,279 @@ +//! File and package management. + +use std::collections::HashMap; +use std::fmt::{self, Debug, Display, Formatter}; +use std::path::{Component, Path, PathBuf}; +use std::str::FromStr; +use std::sync::RwLock; + +use ecow::{eco_format, EcoString}; +use once_cell::sync::Lazy; +use serde::{Deserialize, Deserializer, Serialize, Serializer}; + +use super::is_ident; + +/// The global package-path interner. +static INTERNER: Lazy<RwLock<Interner>> = + Lazy::new(|| RwLock::new(Interner { to_id: HashMap::new(), from_id: Vec::new() })); + +/// A package-path interner. +struct Interner { + to_id: HashMap<Pair, FileId>, + from_id: Vec<Pair>, +} + +/// An interned pair of a package specification and a path. +type Pair = &'static (Option<PackageSpec>, PathBuf); + +/// Identifies a file in a project or package. +/// +/// This type is globally interned and thus cheap to copy, compare, and hash. +#[derive(Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Hash)] +pub struct FileId(u16); + +impl FileId { + /// Create a new interned file specification. + /// + /// The path must start with a `/` or this function will panic. + /// Note that the path is normalized before interning. + #[track_caller] + pub fn new(package: Option<PackageSpec>, path: &Path) -> Self { + assert_eq!( + path.components().next(), + Some(std::path::Component::RootDir), + "file path must be absolute within project or package: {}", + path.display(), + ); + + // Try to find an existing entry that we can reuse. + let pair = (package, normalize_path(path)); + if let Some(&id) = INTERNER.read().unwrap().to_id.get(&pair) { + return id; + } + + let mut interner = INTERNER.write().unwrap(); + let len = interner.from_id.len(); + if len >= usize::from(u16::MAX) { + panic!("too many file specifications"); + } + + // Create a new entry forever by leaking the pair. We can't leak more + // than 2^16 pair (and typically will leak a lot less), so its not a + // big deal. + let id = FileId(len as u16); + let leaked = Box::leak(Box::new(pair)); + interner.to_id.insert(leaked, id); + interner.from_id.push(leaked); + id + } + + /// Get an id that does not identify any real file. + pub const fn detached() -> Self { + Self(u16::MAX) + } + + /// Whether the id is the detached. + pub const fn is_detached(self) -> bool { + self.0 == Self::detached().0 + } + + /// The package the file resides in, if any. + pub fn package(&self) -> Option<&'static PackageSpec> { + if self.is_detached() { + None + } else { + self.pair().0.as_ref() + } + } + + /// The absolute and normalized path to the file _within_ the project or + /// package. + pub fn path(&self) -> &'static Path { + if self.is_detached() { + Path::new("/detached.typ") + } else { + &self.pair().1 + } + } + + /// Resolve a file location relative to this file. + pub fn join(self, path: &str) -> Result<Self, EcoString> { + if self.is_detached() { + Err("cannot access file system from here")?; + } + + let package = self.package().cloned(); + let base = self.path(); + Ok(if let Some(parent) = base.parent() { + Self::new(package, &parent.join(path)) + } else { + Self::new(package, Path::new(path)) + }) + } + + /// Construct from a raw number. + pub(crate) const fn from_u16(v: u16) -> Self { + Self(v) + } + + /// Extract the raw underlying number. + pub(crate) const fn as_u16(self) -> u16 { + self.0 + } + + /// Get the static pair. + fn pair(&self) -> Pair { + INTERNER.read().unwrap().from_id[usize::from(self.0)] + } +} + +impl Display for FileId { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + let path = self.path().display(); + match self.package() { + Some(package) => write!(f, "{package}{path}"), + None => write!(f, "{path}"), + } + } +} + +impl Debug for FileId { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + Display::fmt(self, f) + } +} + +/// Lexically normalize a path. +fn normalize_path(path: &Path) -> PathBuf { + let mut out = PathBuf::new(); + for component in path.components() { + match component { + Component::CurDir => {} + Component::ParentDir => match out.components().next_back() { + Some(Component::Normal(_)) => { + out.pop(); + } + _ => out.push(component), + }, + Component::Prefix(_) | Component::RootDir | Component::Normal(_) => { + out.push(component) + } + } + } + if out.as_os_str().is_empty() { + out.push(Component::CurDir); + } + out +} + +/// Identifies a package. +#[derive(Debug, Clone, Eq, PartialEq, Hash)] +pub struct PackageSpec { + /// The namespace the package lives in. + pub namespace: EcoString, + /// The name of the package within its namespace. + pub name: EcoString, + /// The package's version. + pub version: PackageVersion, +} + +impl FromStr for PackageSpec { + type Err = EcoString; + + fn from_str(s: &str) -> Result<Self, Self::Err> { + let mut s = unscanny::Scanner::new(s); + if !s.eat_if('@') { + Err("package specification must start with '@'")?; + } + + let namespace = s.eat_until('/'); + if namespace.is_empty() { + Err("package specification is missing namespace")?; + } else if !is_ident(namespace) { + Err(eco_format!("`{namespace}` is not a valid package namespace"))?; + } + + s.eat_if('/'); + + let name = s.eat_until(':'); + if name.is_empty() { + Err("package specification is missing name")?; + } else if !is_ident(name) { + Err(eco_format!("`{name}` is not a valid package name"))?; + } + + s.eat_if(':'); + + let version = s.after(); + if version.is_empty() { + Err("package specification is missing version")?; + } + + Ok(Self { + namespace: namespace.into(), + name: name.into(), + version: version.parse()?, + }) + } +} + +impl Display for PackageSpec { + fn fmt(&self, f: &mut Formatter) -> fmt::Result { + write!(f, "@{}/{}:{}", self.namespace, self.name, self.version) + } +} + +/// A package's version. +#[derive(Debug, Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Hash)] +pub struct PackageVersion { + /// The package's major version. + pub major: u32, + /// The package's minor version. + pub minor: u32, + /// The package's patch version. + pub patch: u32, +} + +impl FromStr for PackageVersion { + type Err = EcoString; + + fn from_str(s: &str) -> Result<Self, Self::Err> { + let mut parts = s.split('.'); + let mut next = |kind| { + let part = parts + .next() + .filter(|s| !s.is_empty()) + .ok_or_else(|| eco_format!("version number is missing {kind} version"))?; + part.parse::<u32>() + .map_err(|_| eco_format!("`{part}` is not a valid {kind} version")) + }; + + let major = next("major")?; + let minor = next("minor")?; + let patch = next("patch")?; + if let Some(rest) = parts.next() { + Err(eco_format!("version number has unexpected fourth component: `{rest}`"))?; + } + + Ok(Self { major, minor, patch }) + } +} + +impl Display for PackageVersion { + fn fmt(&self, f: &mut Formatter) -> fmt::Result { + write!(f, "{}.{}.{}", self.major, self.minor, self.patch) + } +} + +impl Serialize for PackageVersion { + fn serialize<S: Serializer>(&self, s: S) -> Result<S::Ok, S::Error> { + s.collect_str(self) + } +} + +impl<'de> Deserialize<'de> for PackageVersion { + fn deserialize<D: Deserializer<'de>>(d: D) -> Result<Self, D::Error> { + let string = EcoString::deserialize(d)?; + string.parse().map_err(serde::de::Error::custom) + } +} diff --git a/crates/typst-syntax/src/kind.rs b/crates/typst-syntax/src/kind.rs new file mode 100644 index 00000000..49119720 --- /dev/null +++ b/crates/typst-syntax/src/kind.rs @@ -0,0 +1,480 @@ +/// A syntactical building block of a Typst file. +/// +/// Can be created by the lexer or by the parser. +#[derive(Debug, Copy, Clone, Eq, PartialEq, Hash)] +#[repr(u8)] +pub enum SyntaxKind { + /// Markup. + Markup, + /// Plain text without markup. + Text, + /// Whitespace. Contains at most one newline in markup, as more indicate a + /// paragraph break. + Space, + /// A forced line break: `\`. + Linebreak, + /// A paragraph break, indicated by one or multiple blank lines. + Parbreak, + /// An escape sequence: `\#`, `\u{1F5FA}`. + Escape, + /// A shorthand for a unicode codepoint. For example, `~` for non-breaking + /// space or `-?` for a soft hyphen. + Shorthand, + /// A smart quote: `'` or `"`. + SmartQuote, + /// Strong content: `*Strong*`. + Strong, + /// Emphasized content: `_Emphasized_`. + Emph, + /// Raw text with optional syntax highlighting: `` `...` ``. + Raw, + /// A hyperlink: `https://typst.org`. + Link, + /// A label: `<intro>`. + Label, + /// A reference: `@target`, `@target[..]`. + Ref, + /// Introduces a reference: `@target`. + RefMarker, + /// A section heading: `= Introduction`. + Heading, + /// Introduces a section heading: `=`, `==`, ... + HeadingMarker, + /// An item in a bullet list: `- ...`. + ListItem, + /// Introduces a list item: `-`. + ListMarker, + /// An item in an enumeration (numbered list): `+ ...` or `1. ...`. + EnumItem, + /// Introduces an enumeration item: `+`, `1.`. + EnumMarker, + /// An item in a term list: `/ Term: Details`. + TermItem, + /// Introduces a term item: `/`. + TermMarker, + /// A mathematical equation: `$x$`, `$ x^2 $`. + Equation, + + /// The contents of a mathematical equation: `x^2 + 1`. + Math, + /// An identifier in math: `pi`. + MathIdent, + /// An alignment point in math: `&`. + MathAlignPoint, + /// Matched delimiters in math: `[x + y]`. + MathDelimited, + /// A base with optional attachments in math: `a_1^2`. + MathAttach, + /// Grouped primes in math: `a'''`. + MathPrimes, + /// A fraction in math: `x/2`. + MathFrac, + /// A root in math: `√x`, `∛x` or `∜x`. + MathRoot, + + /// A hashtag that switches into code mode: `#`. + Hashtag, + /// A left curly brace, starting a code block: `{`. + LeftBrace, + /// A right curly brace, terminating a code block: `}`. + RightBrace, + /// A left square bracket, starting a content block: `[`. + LeftBracket, + /// A right square bracket, terminating a content block: `]`. + RightBracket, + /// A left round parenthesis, starting a grouped expression, collection, + /// argument or parameter list: `(`. + LeftParen, + /// A right round parenthesis, terminating a grouped expression, collection, + /// argument or parameter list: `)`. + RightParen, + /// A comma separator in a sequence: `,`. + Comma, + /// A semicolon terminating an expression: `;`. + Semicolon, + /// A colon between name/key and value in a dictionary, argument or + /// parameter list, or between the term and body of a term list term: `:`. + Colon, + /// The strong text toggle, multiplication operator, and wildcard import + /// symbol: `*`. + Star, + /// Toggles emphasized text and indicates a subscript in math: `_`. + Underscore, + /// Starts and ends a mathematical equation: `$`. + Dollar, + /// The unary plus and binary addition operator: `+`. + Plus, + /// The unary negation and binary subtraction operator: `-`. + Minus, + /// The division operator and fraction operator in math: `/`. + Slash, + /// The superscript operator in math: `^`. + Hat, + /// The prime in math: `'`. + Prime, + /// The field access and method call operator: `.`. + Dot, + /// The assignment operator: `=`. + Eq, + /// The equality operator: `==`. + EqEq, + /// The inequality operator: `!=`. + ExclEq, + /// The less-than operator: `<`. + Lt, + /// The less-than or equal operator: `<=`. + LtEq, + /// The greater-than operator: `>`. + Gt, + /// The greater-than or equal operator: `>=`. + GtEq, + /// The add-assign operator: `+=`. + PlusEq, + /// The subtract-assign operator: `-=`. + HyphEq, + /// The multiply-assign operator: `*=`. + StarEq, + /// The divide-assign operator: `/=`. + SlashEq, + /// The spread operator: `..`. + Dots, + /// An arrow between a closure's parameters and body: `=>`. + Arrow, + /// A root: `√`, `∛` or `∜`. + Root, + + /// The `not` operator. + Not, + /// The `and` operator. + And, + /// The `or` operator. + Or, + /// The `none` literal. + None, + /// The `auto` literal. + Auto, + /// The `let` keyword. + Let, + /// The `set` keyword. + Set, + /// The `show` keyword. + Show, + /// The `if` keyword. + If, + /// The `else` keyword. + Else, + /// The `for` keyword. + For, + /// The `in` keyword. + In, + /// The `while` keyword. + While, + /// The `break` keyword. + Break, + /// The `continue` keyword. + Continue, + /// The `return` keyword. + Return, + /// The `import` keyword. + Import, + /// The `include` keyword. + Include, + /// The `as` keyword. + As, + + /// Code. + Code, + /// An identifier: `it`. + Ident, + /// A boolean: `true`, `false`. + Bool, + /// An integer: `120`. + Int, + /// A floating-point number: `1.2`, `10e-4`. + Float, + /// A numeric value with a unit: `12pt`, `3cm`, `2em`, `90deg`, `50%`. + Numeric, + /// A quoted string: `"..."`. + Str, + /// A code block: `{ let x = 1; x + 2 }`. + CodeBlock, + /// A content block: `[*Hi* there!]`. + ContentBlock, + /// A grouped expression: `(1 + 2)`. + Parenthesized, + /// An array: `(1, "hi", 12cm)`. + Array, + /// A dictionary: `(thickness: 3pt, pattern: dashed)`. + Dict, + /// A named pair: `thickness: 3pt`. + Named, + /// A keyed pair: `"spacy key": true`. + Keyed, + /// A unary operation: `-x`. + Unary, + /// A binary operation: `a + b`. + Binary, + /// A field access: `properties.age`. + FieldAccess, + /// An invocation of a function or method: `f(x, y)`. + FuncCall, + /// A function call's argument list: `(12pt, y)`. + Args, + /// Spread arguments or an argument sink: `..x`. + Spread, + /// A closure: `(x, y) => z`. + Closure, + /// A closure's parameters: `(x, y)`. + Params, + /// A let binding: `let x = 1`. + LetBinding, + /// A set rule: `set text(...)`. + SetRule, + /// A show rule: `show heading: it => emph(it.body)`. + ShowRule, + /// An if-else conditional: `if x { y } else { z }`. + Conditional, + /// A while loop: `while x { y }`. + WhileLoop, + /// A for loop: `for x in y { z }`. + ForLoop, + /// A module import: `import "utils.typ": a, b, c`. + ModuleImport, + /// Items to import from a module: `a, b, c`. + ImportItems, + /// A module include: `include "chapter1.typ"`. + ModuleInclude, + /// A break from a loop: `break`. + LoopBreak, + /// A continue in a loop: `continue`. + LoopContinue, + /// A return from a function: `return`, `return x + 1`. + FuncReturn, + /// A destructuring pattern: `(x, _, ..y)`. + Destructuring, + /// A destructuring assignment expression: `(x, y) = (1, 2)`. + DestructAssignment, + + /// A line comment: `// ...`. + LineComment, + /// A block comment: `/* ... */`. + BlockComment, + /// An invalid sequence of characters. + Error, + /// The end of the file. + Eof, +} + +impl SyntaxKind { + /// Is this a bracket, brace, or parenthesis? + pub fn is_grouping(self) -> bool { + matches!( + self, + Self::LeftBracket + | Self::LeftBrace + | Self::LeftParen + | Self::RightBracket + | Self::RightBrace + | Self::RightParen + ) + } + + /// Does this node terminate a preceding expression? + pub fn is_terminator(self) -> bool { + matches!( + self, + Self::Eof + | Self::Semicolon + | Self::RightBrace + | Self::RightParen + | Self::RightBracket + ) + } + + /// Is this a code or content block. + pub fn is_block(self) -> bool { + matches!(self, Self::CodeBlock | Self::ContentBlock) + } + + /// Does this node need termination through a semicolon or linebreak? + pub fn is_stmt(self) -> bool { + matches!( + self, + Self::LetBinding + | Self::SetRule + | Self::ShowRule + | Self::ModuleImport + | Self::ModuleInclude + ) + } + + /// Is this node is a keyword. + pub fn is_keyword(self) -> bool { + matches!( + self, + Self::Not + | Self::And + | Self::Or + | Self::None + | Self::Auto + | Self::Let + | Self::Set + | Self::Show + | Self::If + | Self::Else + | Self::For + | Self::In + | Self::While + | Self::Break + | Self::Continue + | Self::Return + | Self::Import + | Self::Include + | Self::As + ) + } + + /// Whether this kind of node is automatically skipped by the parser in + /// code and math mode. + pub fn is_trivia(self) -> bool { + matches!( + self, + Self::Space | Self::Parbreak | Self::LineComment | Self::BlockComment + ) + } + + /// Whether this is an error. + pub fn is_error(self) -> bool { + self == Self::Error + } + + /// A human-readable name for the kind. + pub fn name(self) -> &'static str { + match self { + Self::Markup => "markup", + Self::Text => "text", + Self::Space => "space", + Self::Linebreak => "line break", + Self::Parbreak => "paragraph break", + Self::Escape => "escape sequence", + Self::Shorthand => "shorthand", + Self::SmartQuote => "smart quote", + Self::Strong => "strong content", + Self::Emph => "emphasized content", + Self::Raw => "raw block", + Self::Link => "link", + Self::Label => "label", + Self::Ref => "reference", + Self::RefMarker => "reference marker", + Self::Heading => "heading", + Self::HeadingMarker => "heading marker", + Self::ListItem => "list item", + Self::ListMarker => "list marker", + Self::EnumItem => "enum item", + Self::EnumMarker => "enum marker", + Self::TermItem => "term list item", + Self::TermMarker => "term marker", + Self::Equation => "equation", + Self::Math => "math", + Self::MathIdent => "math identifier", + Self::MathAlignPoint => "math alignment point", + Self::MathDelimited => "delimited math", + Self::MathAttach => "math attachments", + Self::MathFrac => "math fraction", + Self::MathRoot => "math root", + Self::MathPrimes => "math primes", + Self::Hashtag => "hashtag", + Self::LeftBrace => "opening brace", + Self::RightBrace => "closing brace", + Self::LeftBracket => "opening bracket", + Self::RightBracket => "closing bracket", + Self::LeftParen => "opening paren", + Self::RightParen => "closing paren", + Self::Comma => "comma", + Self::Semicolon => "semicolon", + Self::Colon => "colon", + Self::Star => "star", + Self::Underscore => "underscore", + Self::Dollar => "dollar sign", + Self::Plus => "plus", + Self::Minus => "minus", + Self::Slash => "slash", + Self::Hat => "hat", + Self::Prime => "prime", + Self::Dot => "dot", + Self::Eq => "equals sign", + Self::EqEq => "equality operator", + Self::ExclEq => "inequality operator", + Self::Lt => "less-than operator", + Self::LtEq => "less-than or equal operator", + Self::Gt => "greater-than operator", + Self::GtEq => "greater-than or equal operator", + Self::PlusEq => "add-assign operator", + Self::HyphEq => "subtract-assign operator", + Self::StarEq => "multiply-assign operator", + Self::SlashEq => "divide-assign operator", + Self::Dots => "dots", + Self::Arrow => "arrow", + Self::Root => "root", + Self::Not => "operator `not`", + Self::And => "operator `and`", + Self::Or => "operator `or`", + Self::None => "`none`", + Self::Auto => "`auto`", + Self::Let => "keyword `let`", + Self::Set => "keyword `set`", + Self::Show => "keyword `show`", + Self::If => "keyword `if`", + Self::Else => "keyword `else`", + Self::For => "keyword `for`", + Self::In => "keyword `in`", + Self::While => "keyword `while`", + Self::Break => "keyword `break`", + Self::Continue => "keyword `continue`", + Self::Return => "keyword `return`", + Self::Import => "keyword `import`", + Self::Include => "keyword `include`", + Self::As => "keyword `as`", + Self::Code => "code", + Self::Ident => "identifier", + Self::Bool => "boolean", + Self::Int => "integer", + Self::Float => "float", + Self::Numeric => "numeric value", + Self::Str => "string", + Self::CodeBlock => "code block", + Self::ContentBlock => "content block", + Self::Parenthesized => "group", + Self::Array => "array", + Self::Dict => "dictionary", + Self::Named => "named pair", + Self::Keyed => "keyed pair", + Self::Unary => "unary expression", + Self::Binary => "binary expression", + Self::FieldAccess => "field access", + Self::FuncCall => "function call", + Self::Args => "call arguments", + Self::Spread => "spread", + Self::Closure => "closure", + Self::Params => "closure parameters", + Self::LetBinding => "`let` expression", + Self::SetRule => "`set` expression", + Self::ShowRule => "`show` expression", + Self::Conditional => "`if` expression", + Self::WhileLoop => "while-loop expression", + Self::ForLoop => "for-loop expression", + Self::ModuleImport => "`import` expression", + Self::ImportItems => "import items", + Self::ModuleInclude => "`include` expression", + Self::LoopBreak => "`break` expression", + Self::LoopContinue => "`continue` expression", + Self::FuncReturn => "`return` expression", + Self::Destructuring => "destructuring pattern", + Self::DestructAssignment => "destructuring assignment expression", + Self::LineComment => "line comment", + Self::BlockComment => "block comment", + Self::Error => "syntax error", + Self::Eof => "end of file", + } + } +} diff --git a/crates/typst-syntax/src/lexer.rs b/crates/typst-syntax/src/lexer.rs new file mode 100644 index 00000000..b96b3c07 --- /dev/null +++ b/crates/typst-syntax/src/lexer.rs @@ -0,0 +1,739 @@ +use ecow::{eco_format, EcoString}; +use unicode_ident::{is_xid_continue, is_xid_start}; +use unicode_segmentation::UnicodeSegmentation; +use unscanny::Scanner; + +use super::SyntaxKind; + +/// Splits up a string of source code into tokens. +#[derive(Clone)] +pub(super) struct Lexer<'s> { + /// The underlying scanner. + s: Scanner<'s>, + /// The mode the lexer is in. This determines which kinds of tokens it + /// produces. + mode: LexMode, + /// Whether the last token contained a newline. + newline: bool, + /// An error for the last token. + error: Option<EcoString>, +} + +/// What kind of tokens to emit. +#[derive(Debug, Copy, Clone, Eq, PartialEq)] +pub(super) enum LexMode { + /// Text and markup. + Markup, + /// Math atoms, operators, etc. + Math, + /// Keywords, literals and operators. + Code, +} + +impl<'s> Lexer<'s> { + /// Create a new lexer with the given mode and a prefix to offset column + /// calculations. + pub fn new(text: &'s str, mode: LexMode) -> Self { + Self { + s: Scanner::new(text), + mode, + newline: false, + error: None, + } + } + + /// Get the current lexing mode. + pub fn mode(&self) -> LexMode { + self.mode + } + + /// Change the lexing mode. + pub fn set_mode(&mut self, mode: LexMode) { + self.mode = mode; + } + + /// The index in the string at which the last token ends and next token + /// will start. + pub fn cursor(&self) -> usize { + self.s.cursor() + } + + /// Jump to the given index in the string. + pub fn jump(&mut self, index: usize) { + self.s.jump(index); + } + + /// Whether the last token contained a newline. + pub fn newline(&self) -> bool { + self.newline + } + + /// Take out the last error, if any. + pub fn take_error(&mut self) -> Option<EcoString> { + self.error.take() + } +} + +impl Lexer<'_> { + /// Construct a full-positioned syntax error. + fn error(&mut self, message: impl Into<EcoString>) -> SyntaxKind { + self.error = Some(message.into()); + SyntaxKind::Error + } +} + +/// Shared. +impl Lexer<'_> { + pub fn next(&mut self) -> SyntaxKind { + self.newline = false; + self.error = None; + let start = self.s.cursor(); + match self.s.eat() { + Some(c) if c.is_whitespace() => self.whitespace(start, c), + Some('/') if self.s.eat_if('/') => self.line_comment(), + Some('/') if self.s.eat_if('*') => self.block_comment(), + Some('*') if self.s.eat_if('/') => { + self.error("unexpected end of block comment") + } + + Some(c) => match self.mode { + LexMode::Markup => self.markup(start, c), + LexMode::Math => self.math(start, c), + LexMode::Code => self.code(start, c), + }, + + None => SyntaxKind::Eof, + } + } + + fn whitespace(&mut self, start: usize, c: char) -> SyntaxKind { + let more = self.s.eat_while(char::is_whitespace); + let newlines = match c { + ' ' if more.is_empty() => 0, + _ => count_newlines(self.s.from(start)), + }; + + self.newline = newlines > 0; + if self.mode == LexMode::Markup && newlines >= 2 { + SyntaxKind::Parbreak + } else { + SyntaxKind::Space + } + } + + fn line_comment(&mut self) -> SyntaxKind { + self.s.eat_until(is_newline); + SyntaxKind::LineComment + } + + fn block_comment(&mut self) -> SyntaxKind { + let mut state = '_'; + let mut depth = 1; + + // Find the first `*/` that does not correspond to a nested `/*`. + while let Some(c) = self.s.eat() { + state = match (state, c) { + ('*', '/') => { + depth -= 1; + if depth == 0 { + break; + } + '_' + } + ('/', '*') => { + depth += 1; + '_' + } + ('/', '/') => { + self.line_comment(); + '_' + } + _ => c, + } + } + + SyntaxKind::BlockComment + } +} + +/// Markup. +impl Lexer<'_> { + fn markup(&mut self, start: usize, c: char) -> SyntaxKind { + match c { + '\\' => self.backslash(), + '`' => self.raw(), + 'h' if self.s.eat_if("ttp://") => self.link(), + 'h' if self.s.eat_if("ttps://") => self.link(), + '<' if self.s.at(is_id_continue) => self.label(), + '@' => self.ref_marker(), + + '.' if self.s.eat_if("..") => SyntaxKind::Shorthand, + '-' if self.s.eat_if("--") => SyntaxKind::Shorthand, + '-' if self.s.eat_if('-') => SyntaxKind::Shorthand, + '-' if self.s.eat_if('?') => SyntaxKind::Shorthand, + '*' if !self.in_word() => SyntaxKind::Star, + '_' if !self.in_word() => SyntaxKind::Underscore, + + '#' => SyntaxKind::Hashtag, + '[' => SyntaxKind::LeftBracket, + ']' => SyntaxKind::RightBracket, + '\'' => SyntaxKind::SmartQuote, + '"' => SyntaxKind::SmartQuote, + '$' => SyntaxKind::Dollar, + '~' => SyntaxKind::Shorthand, + ':' => SyntaxKind::Colon, + '=' => { + self.s.eat_while('='); + if self.space_or_end() { + SyntaxKind::HeadingMarker + } else { + self.text() + } + } + '-' if self.space_or_end() => SyntaxKind::ListMarker, + '+' if self.space_or_end() => SyntaxKind::EnumMarker, + '/' if self.space_or_end() => SyntaxKind::TermMarker, + '0'..='9' => self.numbering(start), + + _ => self.text(), + } + } + + fn backslash(&mut self) -> SyntaxKind { + if self.s.eat_if("u{") { + let hex = self.s.eat_while(char::is_ascii_alphanumeric); + if !self.s.eat_if('}') { + return self.error("unclosed Unicode escape sequence"); + } + + if u32::from_str_radix(hex, 16) + .ok() + .and_then(std::char::from_u32) + .is_none() + { + return self.error(eco_format!("invalid Unicode codepoint: {}", hex)); + } + + return SyntaxKind::Escape; + } + + if self.s.done() || self.s.at(char::is_whitespace) { + SyntaxKind::Linebreak + } else { + self.s.eat(); + SyntaxKind::Escape + } + } + + fn raw(&mut self) -> SyntaxKind { + let mut backticks = 1; + while self.s.eat_if('`') { + backticks += 1; + } + + if backticks == 2 { + return SyntaxKind::Raw; + } + + let mut found = 0; + while found < backticks { + match self.s.eat() { + Some('`') => found += 1, + Some(_) => found = 0, + None => break, + } + } + + if found != backticks { + return self.error("unclosed raw text"); + } + + SyntaxKind::Raw + } + + fn link(&mut self) -> SyntaxKind { + let mut brackets = Vec::new(); + + #[rustfmt::skip] + self.s.eat_while(|c: char| { + match c { + | '0' ..= '9' + | 'a' ..= 'z' + | 'A' ..= 'Z' + | '!' | '#' | '$' | '%' | '&' | '*' | '+' + | ',' | '-' | '.' | '/' | ':' | ';' | '=' + | '?' | '@' | '_' | '~' | '\'' => true, + '[' => { + brackets.push(SyntaxKind::LeftBracket); + true + } + '(' => { + brackets.push(SyntaxKind::LeftParen); + true + } + ']' => brackets.pop() == Some(SyntaxKind::LeftBracket), + ')' => brackets.pop() == Some(SyntaxKind::LeftParen), + _ => false, + } + }); + + if !brackets.is_empty() { + return self.error( + "automatic links cannot contain unbalanced brackets, \ + use the `link` function instead", + ); + } + + // Don't include the trailing characters likely to be part of text. + while matches!(self.s.scout(-1), Some('!' | ',' | '.' | ':' | ';' | '?' | '\'')) { + self.s.uneat(); + } + + SyntaxKind::Link + } + + fn numbering(&mut self, start: usize) -> SyntaxKind { + self.s.eat_while(char::is_ascii_digit); + + let read = self.s.from(start); + if self.s.eat_if('.') && self.space_or_end() && read.parse::<usize>().is_ok() { + return SyntaxKind::EnumMarker; + } + + self.text() + } + + fn ref_marker(&mut self) -> SyntaxKind { + self.s.eat_while(|c| is_id_continue(c) || matches!(c, ':' | '.')); + + // Don't include the trailing characters likely to be part of text. + while matches!(self.s.scout(-1), Some('.' | ':')) { + self.s.uneat(); + } + + SyntaxKind::RefMarker + } + + fn label(&mut self) -> SyntaxKind { + let label = self.s.eat_while(|c| is_id_continue(c) || matches!(c, ':' | '.')); + if label.is_empty() { + return self.error("label cannot be empty"); + } + + if !self.s.eat_if('>') { + return self.error("unclosed label"); + } + + SyntaxKind::Label + } + + fn text(&mut self) -> SyntaxKind { + macro_rules! table { + ($(|$c:literal)*) => { + static TABLE: [bool; 128] = { + let mut t = [false; 128]; + $(t[$c as usize] = true;)* + t + }; + }; + } + + table! { + | ' ' | '\t' | '\n' | '\x0b' | '\x0c' | '\r' | '\\' | '/' + | '[' | ']' | '{' | '}' | '~' | '-' | '.' | '\'' | '"' + | '*' | '_' | ':' | 'h' | '`' | '$' | '<' | '>' | '@' | '#' + }; + + loop { + self.s.eat_until(|c: char| { + TABLE.get(c as usize).copied().unwrap_or_else(|| c.is_whitespace()) + }); + + // Continue with the same text node if the thing would become text + // anyway. + let mut s = self.s; + match s.eat() { + Some(' ') if s.at(char::is_alphanumeric) => {} + Some('/') if !s.at(['/', '*']) => {} + Some('-') if !s.at(['-', '?']) => {} + Some('.') if !s.at("..") => {} + Some('h') if !s.at("ttp://") && !s.at("ttps://") => {} + Some('@') if !s.at(is_id_start) => {} + _ => break, + } + + self.s = s; + } + + SyntaxKind::Text + } + + fn in_word(&self) -> bool { + let alphanum = |c: Option<char>| c.map_or(false, |c| c.is_alphanumeric()); + let prev = self.s.scout(-2); + let next = self.s.peek(); + alphanum(prev) && alphanum(next) + } + + fn space_or_end(&self) -> bool { + self.s.done() || self.s.at(char::is_whitespace) + } +} + +/// Math. +impl Lexer<'_> { + fn math(&mut self, start: usize, c: char) -> SyntaxKind { + match c { + '\\' => self.backslash(), + '"' => self.string(), + + '-' if self.s.eat_if(">>") => SyntaxKind::Shorthand, + '-' if self.s.eat_if('>') => SyntaxKind::Shorthand, + '-' if self.s.eat_if("->") => SyntaxKind::Shorthand, + ':' if self.s.eat_if('=') => SyntaxKind::Shorthand, + ':' if self.s.eat_if(":=") => SyntaxKind::Shorthand, + '!' if self.s.eat_if('=') => SyntaxKind::Shorthand, + '.' if self.s.eat_if("..") => SyntaxKind::Shorthand, + '[' if self.s.eat_if('|') => SyntaxKind::Shorthand, + '<' if self.s.eat_if("==>") => SyntaxKind::Shorthand, + '<' if self.s.eat_if("-->") => SyntaxKind::Shorthand, + '<' if self.s.eat_if("--") => SyntaxKind::Shorthand, + '<' if self.s.eat_if("-<") => SyntaxKind::Shorthand, + '<' if self.s.eat_if("->") => SyntaxKind::Shorthand, + '<' if self.s.eat_if("<-") => SyntaxKind::Shorthand, + '<' if self.s.eat_if("<<") => SyntaxKind::Shorthand, + '<' if self.s.eat_if("=>") => SyntaxKind::Shorthand, + '<' if self.s.eat_if("==") => SyntaxKind::Shorthand, + '<' if self.s.eat_if("~~") => SyntaxKind::Shorthand, + '<' if self.s.eat_if('=') => SyntaxKind::Shorthand, + '<' if self.s.eat_if('<') => SyntaxKind::Shorthand, + '<' if self.s.eat_if('-') => SyntaxKind::Shorthand, + '<' if self.s.eat_if('~') => SyntaxKind::Shorthand, + '>' if self.s.eat_if("->") => SyntaxKind::Shorthand, + '>' if self.s.eat_if(">>") => SyntaxKind::Shorthand, + '=' if self.s.eat_if("=>") => SyntaxKind::Shorthand, + '=' if self.s.eat_if('>') => SyntaxKind::Shorthand, + '=' if self.s.eat_if(':') => SyntaxKind::Shorthand, + '>' if self.s.eat_if('=') => SyntaxKind::Shorthand, + '>' if self.s.eat_if('>') => SyntaxKind::Shorthand, + '|' if self.s.eat_if("->") => SyntaxKind::Shorthand, + '|' if self.s.eat_if("=>") => SyntaxKind::Shorthand, + '|' if self.s.eat_if(']') => SyntaxKind::Shorthand, + '|' if self.s.eat_if('|') => SyntaxKind::Shorthand, + '~' if self.s.eat_if("~>") => SyntaxKind::Shorthand, + '~' if self.s.eat_if('>') => SyntaxKind::Shorthand, + '*' | '-' => SyntaxKind::Shorthand, + + '#' => SyntaxKind::Hashtag, + '_' => SyntaxKind::Underscore, + '$' => SyntaxKind::Dollar, + '/' => SyntaxKind::Slash, + '^' => SyntaxKind::Hat, + '\'' => SyntaxKind::Prime, + '&' => SyntaxKind::MathAlignPoint, + '√' | '∛' | '∜' => SyntaxKind::Root, + + // Identifiers. + c if is_math_id_start(c) && self.s.at(is_math_id_continue) => { + self.s.eat_while(is_math_id_continue); + SyntaxKind::MathIdent + } + + // Other math atoms. + _ => self.math_text(start, c), + } + } + + fn math_text(&mut self, start: usize, c: char) -> SyntaxKind { + // Keep numbers and grapheme clusters together. + if c.is_numeric() { + self.s.eat_while(char::is_numeric); + let mut s = self.s; + if s.eat_if('.') && !s.eat_while(char::is_numeric).is_empty() { + self.s = s; + } + } else { + let len = self + .s + .get(start..self.s.string().len()) + .graphemes(true) + .next() + .map_or(0, str::len); + self.s.jump(start + len); + } + SyntaxKind::Text + } +} + +/// Code. +impl Lexer<'_> { + fn code(&mut self, start: usize, c: char) -> SyntaxKind { + match c { + '`' => self.raw(), + '<' if self.s.at(is_id_continue) => self.label(), + '0'..='9' => self.number(start, c), + '.' if self.s.at(char::is_ascii_digit) => self.number(start, c), + '"' => self.string(), + + '=' if self.s.eat_if('=') => SyntaxKind::EqEq, + '!' if self.s.eat_if('=') => SyntaxKind::ExclEq, + '<' if self.s.eat_if('=') => SyntaxKind::LtEq, + '>' if self.s.eat_if('=') => SyntaxKind::GtEq, + '+' if self.s.eat_if('=') => SyntaxKind::PlusEq, + '-' if self.s.eat_if('=') => SyntaxKind::HyphEq, + '*' if self.s.eat_if('=') => SyntaxKind::StarEq, + '/' if self.s.eat_if('=') => SyntaxKind::SlashEq, + '.' if self.s.eat_if('.') => SyntaxKind::Dots, + '=' if self.s.eat_if('>') => SyntaxKind::Arrow, + + '{' => SyntaxKind::LeftBrace, + '}' => SyntaxKind::RightBrace, + '[' => SyntaxKind::LeftBracket, + ']' => SyntaxKind::RightBracket, + '(' => SyntaxKind::LeftParen, + ')' => SyntaxKind::RightParen, + '$' => SyntaxKind::Dollar, + ',' => SyntaxKind::Comma, + ';' => SyntaxKind::Semicolon, + ':' => SyntaxKind::Colon, + '.' => SyntaxKind::Dot, + '+' => SyntaxKind::Plus, + '-' => SyntaxKind::Minus, + '*' => SyntaxKind::Star, + '/' => SyntaxKind::Slash, + '=' => SyntaxKind::Eq, + '<' => SyntaxKind::Lt, + '>' => SyntaxKind::Gt, + + c if is_id_start(c) => self.ident(start), + + c => self.error(eco_format!("the character `{c}` is not valid in code")), + } + } + + fn ident(&mut self, start: usize) -> SyntaxKind { + self.s.eat_while(is_id_continue); + let ident = self.s.from(start); + + let prev = self.s.get(0..start); + if !prev.ends_with(['.', '@']) || prev.ends_with("..") { + if let Some(keyword) = keyword(ident) { + return keyword; + } + } + + if ident == "_" { + SyntaxKind::Underscore + } else { + SyntaxKind::Ident + } + } + + fn number(&mut self, mut start: usize, c: char) -> SyntaxKind { + // Handle alternative integer bases. + let mut base = 10; + if c == '0' { + if self.s.eat_if('b') { + base = 2; + } else if self.s.eat_if('o') { + base = 8; + } else if self.s.eat_if('x') { + base = 16; + } + if base != 10 { + start = self.s.cursor(); + } + } + + // Read the first part (integer or fractional depending on `first`). + self.s.eat_while(if base == 16 { + char::is_ascii_alphanumeric + } else { + char::is_ascii_digit + }); + + // Read the fractional part if not already done. + // Make sure not to confuse a range for the decimal separator. + if c != '.' + && !self.s.at("..") + && !self.s.scout(1).map_or(false, is_id_start) + && self.s.eat_if('.') + && base == 10 + { + self.s.eat_while(char::is_ascii_digit); + } + + // Read the exponent. + if !self.s.at("em") && self.s.eat_if(['e', 'E']) && base == 10 { + self.s.eat_if(['+', '-']); + self.s.eat_while(char::is_ascii_digit); + } + + // Read the suffix. + let suffix_start = self.s.cursor(); + if !self.s.eat_if('%') { + self.s.eat_while(char::is_ascii_alphanumeric); + } + + let number = self.s.get(start..suffix_start); + let suffix = self.s.from(suffix_start); + + let kind = if i64::from_str_radix(number, base).is_ok() { + SyntaxKind::Int + } else if base == 10 && number.parse::<f64>().is_ok() { + SyntaxKind::Float + } else { + return self.error(match base { + 2 => eco_format!("invalid binary number: 0b{}", number), + 8 => eco_format!("invalid octal number: 0o{}", number), + 16 => eco_format!("invalid hexadecimal number: 0x{}", number), + _ => eco_format!("invalid number: {}", number), + }); + }; + + if suffix.is_empty() { + return kind; + } + + if !matches!( + suffix, + "pt" | "mm" | "cm" | "in" | "deg" | "rad" | "em" | "fr" | "%" + ) { + return self.error(eco_format!("invalid number suffix: {}", suffix)); + } + + SyntaxKind::Numeric + } + + fn string(&mut self) -> SyntaxKind { + let mut escaped = false; + self.s.eat_until(|c| { + let stop = c == '"' && !escaped; + escaped = c == '\\' && !escaped; + stop + }); + + if !self.s.eat_if('"') { + return self.error("unclosed string"); + } + + SyntaxKind::Str + } +} + +/// Try to parse an identifier into a keyword. +fn keyword(ident: &str) -> Option<SyntaxKind> { + Some(match ident { + "none" => SyntaxKind::None, + "auto" => SyntaxKind::Auto, + "true" => SyntaxKind::Bool, + "false" => SyntaxKind::Bool, + "not" => SyntaxKind::Not, + "and" => SyntaxKind::And, + "or" => SyntaxKind::Or, + "let" => SyntaxKind::Let, + "set" => SyntaxKind::Set, + "show" => SyntaxKind::Show, + "if" => SyntaxKind::If, + "else" => SyntaxKind::Else, + "for" => SyntaxKind::For, + "in" => SyntaxKind::In, + "while" => SyntaxKind::While, + "break" => SyntaxKind::Break, + "continue" => SyntaxKind::Continue, + "return" => SyntaxKind::Return, + "import" => SyntaxKind::Import, + "include" => SyntaxKind::Include, + "as" => SyntaxKind::As, + _ => return None, + }) +} + +/// Whether a character is interpreted as a newline by Typst. +#[inline] +pub fn is_newline(character: char) -> bool { + matches!( + character, + // Line Feed, Vertical Tab, Form Feed, Carriage Return. + '\n' | '\x0B' | '\x0C' | '\r' | + // Next Line, Line Separator, Paragraph Separator. + '\u{0085}' | '\u{2028}' | '\u{2029}' + ) +} + +/// Split text at newlines. +pub(super) fn split_newlines(text: &str) -> Vec<&str> { + let mut s = Scanner::new(text); + let mut lines = Vec::new(); + let mut start = 0; + let mut end = 0; + + while let Some(c) = s.eat() { + if is_newline(c) { + if c == '\r' { + s.eat_if('\n'); + } + + lines.push(&text[start..end]); + start = s.cursor(); + } + end = s.cursor(); + } + + lines.push(&text[start..]); + lines +} + +/// Count the number of newlines in text. +fn count_newlines(text: &str) -> usize { + let mut newlines = 0; + let mut s = Scanner::new(text); + while let Some(c) = s.eat() { + if is_newline(c) { + if c == '\r' { + s.eat_if('\n'); + } + newlines += 1; + } + } + newlines +} + +/// Whether a string is a valid Typst identifier. +/// +/// In addition to what is specified in the [Unicode Standard][uax31], we allow: +/// - `_` as a starting character, +/// - `_` and `-` as continuing characters. +/// +/// [uax31]: http://www.unicode.org/reports/tr31/ +#[inline] +pub fn is_ident(string: &str) -> bool { + let mut chars = string.chars(); + chars + .next() + .map_or(false, |c| is_id_start(c) && chars.all(is_id_continue)) +} + +/// Whether a character can start an identifier. +#[inline] +pub fn is_id_start(c: char) -> bool { + is_xid_start(c) || c == '_' +} + +/// Whether a character can continue an identifier. +#[inline] +pub fn is_id_continue(c: char) -> bool { + is_xid_continue(c) || c == '_' || c == '-' +} + +/// Whether a character can start an identifier in math. +#[inline] +fn is_math_id_start(c: char) -> bool { + is_xid_start(c) +} + +/// Whether a character can continue an identifier in math. +#[inline] +fn is_math_id_continue(c: char) -> bool { + is_xid_continue(c) && c != '_' +} diff --git a/crates/typst-syntax/src/lib.rs b/crates/typst-syntax/src/lib.rs new file mode 100644 index 00000000..8562bb19 --- /dev/null +++ b/crates/typst-syntax/src/lib.rs @@ -0,0 +1,23 @@ +//! Parser and syntax tree for Typst. + +pub mod ast; + +mod file; +mod kind; +mod lexer; +mod node; +mod parser; +mod reparser; +mod source; +mod span; + +pub use self::file::{FileId, PackageSpec, PackageVersion}; +pub use self::kind::SyntaxKind; +pub use self::lexer::{is_id_continue, is_id_start, is_ident, is_newline}; +pub use self::node::{LinkedChildren, LinkedNode, SyntaxError, SyntaxNode}; +pub use self::parser::{parse, parse_code, parse_math}; +pub use self::source::Source; +pub use self::span::{Span, Spanned}; + +use self::lexer::{split_newlines, LexMode, Lexer}; +use self::parser::{reparse_block, reparse_markup}; diff --git a/crates/typst-syntax/src/node.rs b/crates/typst-syntax/src/node.rs new file mode 100644 index 00000000..f949b4dd --- /dev/null +++ b/crates/typst-syntax/src/node.rs @@ -0,0 +1,912 @@ +use std::fmt::{self, Debug, Display, Formatter}; +use std::ops::{Deref, Range}; +use std::rc::Rc; +use std::sync::Arc; + +use ecow::EcoString; + +use super::ast::AstNode; +use super::{FileId, Span, SyntaxKind}; + +/// A node in the untyped syntax tree. +#[derive(Clone, Eq, PartialEq, Hash)] +pub struct SyntaxNode(Repr); + +/// The three internal representations. +#[derive(Clone, Eq, PartialEq, Hash)] +enum Repr { + /// A leaf node. + Leaf(LeafNode), + /// A reference-counted inner node. + Inner(Arc<InnerNode>), + /// An error node. + Error(Arc<ErrorNode>), +} + +impl SyntaxNode { + /// Create a new leaf node. + pub fn leaf(kind: SyntaxKind, text: impl Into<EcoString>) -> Self { + Self(Repr::Leaf(LeafNode::new(kind, text))) + } + + /// Create a new inner node with children. + pub fn inner(kind: SyntaxKind, children: Vec<SyntaxNode>) -> Self { + Self(Repr::Inner(Arc::new(InnerNode::new(kind, children)))) + } + + /// Create a new error node. + pub fn error(message: impl Into<EcoString>, text: impl Into<EcoString>) -> Self { + Self(Repr::Error(Arc::new(ErrorNode::new(message, text)))) + } + + /// The type of the node. + pub fn kind(&self) -> SyntaxKind { + match &self.0 { + Repr::Leaf(leaf) => leaf.kind, + Repr::Inner(inner) => inner.kind, + Repr::Error(_) => SyntaxKind::Error, + } + } + + /// Return `true` if the length is 0. + pub fn is_empty(&self) -> bool { + self.len() == 0 + } + + /// The byte length of the node in the source text. + pub fn len(&self) -> usize { + match &self.0 { + Repr::Leaf(leaf) => leaf.len(), + Repr::Inner(inner) => inner.len, + Repr::Error(node) => node.len(), + } + } + + /// The span of the node. + pub fn span(&self) -> Span { + match &self.0 { + Repr::Leaf(leaf) => leaf.span, + Repr::Inner(inner) => inner.span, + Repr::Error(node) => node.error.span, + } + } + + /// The text of the node if it is a leaf or error node. + /// + /// Returns the empty string if this is an inner node. + pub fn text(&self) -> &EcoString { + static EMPTY: EcoString = EcoString::new(); + match &self.0 { + Repr::Leaf(leaf) => &leaf.text, + Repr::Inner(_) => &EMPTY, + Repr::Error(node) => &node.text, + } + } + + /// Extract the text from the node. + /// + /// Builds the string if this is an inner node. + pub fn into_text(self) -> EcoString { + match self.0 { + Repr::Leaf(leaf) => leaf.text, + Repr::Inner(inner) => { + inner.children.iter().cloned().map(Self::into_text).collect() + } + Repr::Error(node) => node.text.clone(), + } + } + + /// The node's children. + pub fn children(&self) -> std::slice::Iter<'_, SyntaxNode> { + match &self.0 { + Repr::Leaf(_) | Repr::Error(_) => [].iter(), + Repr::Inner(inner) => inner.children.iter(), + } + } + + /// Whether the node can be cast to the given AST node. + pub fn is<T: AstNode>(&self) -> bool { + self.cast::<T>().is_some() + } + + /// Try to convert the node to a typed AST node. + pub fn cast<T: AstNode>(&self) -> Option<T> { + T::from_untyped(self) + } + + /// Cast the first child that can cast to the AST type `T`. + pub fn cast_first_match<T: AstNode>(&self) -> Option<T> { + self.children().find_map(Self::cast) + } + + /// Cast the last child that can cast to the AST type `T`. + pub fn cast_last_match<T: AstNode>(&self) -> Option<T> { + self.children().rev().find_map(Self::cast) + } + + /// Whether the node or its children contain an error. + pub fn erroneous(&self) -> bool { + match &self.0 { + Repr::Leaf(_) => false, + Repr::Inner(inner) => inner.erroneous, + Repr::Error(_) => true, + } + } + + /// The error messages for this node and its descendants. + pub fn errors(&self) -> Vec<SyntaxError> { + if !self.erroneous() { + return vec![]; + } + + if let Repr::Error(node) = &self.0 { + vec![node.error.clone()] + } else { + self.children() + .filter(|node| node.erroneous()) + .flat_map(|node| node.errors()) + .collect() + } + } + + /// Add a user-presentable hint if this is an error node. + pub fn hint(&mut self, hint: impl Into<EcoString>) { + if let Repr::Error(node) = &mut self.0 { + Arc::make_mut(node).hint(hint); + } + } + + /// Set a synthetic span for the node and all its descendants. + pub fn synthesize(&mut self, span: Span) { + match &mut self.0 { + Repr::Leaf(leaf) => leaf.span = span, + Repr::Inner(inner) => Arc::make_mut(inner).synthesize(span), + Repr::Error(node) => Arc::make_mut(node).error.span = span, + } + } +} + +impl SyntaxNode { + /// Mark this node as erroneous. + pub(super) fn make_erroneous(&mut self) { + if let Repr::Inner(inner) = &mut self.0 { + Arc::make_mut(inner).erroneous = true; + } + } + + /// Convert the child to another kind. + #[track_caller] + pub(super) fn convert_to_kind(&mut self, kind: SyntaxKind) { + debug_assert!(!kind.is_error()); + match &mut self.0 { + Repr::Leaf(leaf) => leaf.kind = kind, + Repr::Inner(inner) => Arc::make_mut(inner).kind = kind, + Repr::Error(_) => panic!("cannot convert error"), + } + } + + /// Convert the child to an error. + pub(super) fn convert_to_error(&mut self, message: impl Into<EcoString>) { + let text = std::mem::take(self).into_text(); + *self = SyntaxNode::error(message, text); + } + + /// Assign spans to each node. + #[tracing::instrument(skip_all)] + pub(super) fn numberize( + &mut self, + id: FileId, + within: Range<u64>, + ) -> NumberingResult { + if within.start >= within.end { + return Err(Unnumberable); + } + + let mid = Span::new(id, (within.start + within.end) / 2); + match &mut self.0 { + Repr::Leaf(leaf) => leaf.span = mid, + Repr::Inner(inner) => Arc::make_mut(inner).numberize(id, None, within)?, + Repr::Error(node) => Arc::make_mut(node).error.span = mid, + } + + Ok(()) + } + + /// Whether this is a leaf node. + pub(super) fn is_leaf(&self) -> bool { + matches!(self.0, Repr::Leaf(_)) + } + + /// The number of descendants, including the node itself. + pub(super) fn descendants(&self) -> usize { + match &self.0 { + Repr::Leaf(_) | Repr::Error(_) => 1, + Repr::Inner(inner) => inner.descendants, + } + } + + /// The node's children, mutably. + pub(super) fn children_mut(&mut self) -> &mut [SyntaxNode] { + match &mut self.0 { + Repr::Leaf(_) | Repr::Error(_) => &mut [], + Repr::Inner(inner) => &mut Arc::make_mut(inner).children, + } + } + + /// Replaces a range of children with a replacement. + /// + /// May have mutated the children if it returns `Err(_)`. + pub(super) fn replace_children( + &mut self, + range: Range<usize>, + replacement: Vec<SyntaxNode>, + ) -> NumberingResult { + if let Repr::Inner(inner) = &mut self.0 { + Arc::make_mut(inner).replace_children(range, replacement)?; + } + Ok(()) + } + + /// Update this node after changes were made to one of its children. + pub(super) fn update_parent( + &mut self, + prev_len: usize, + new_len: usize, + prev_descendants: usize, + new_descendants: usize, + ) { + if let Repr::Inner(inner) = &mut self.0 { + Arc::make_mut(inner).update_parent( + prev_len, + new_len, + prev_descendants, + new_descendants, + ); + } + } + + /// The upper bound of assigned numbers in this subtree. + pub(super) fn upper(&self) -> u64 { + match &self.0 { + Repr::Leaf(leaf) => leaf.span.number() + 1, + Repr::Inner(inner) => inner.upper, + Repr::Error(node) => node.error.span.number() + 1, + } + } +} + +impl Debug for SyntaxNode { + fn fmt(&self, f: &mut Formatter) -> fmt::Result { + match &self.0 { + Repr::Leaf(leaf) => leaf.fmt(f), + Repr::Inner(inner) => inner.fmt(f), + Repr::Error(node) => node.fmt(f), + } + } +} + +impl Default for SyntaxNode { + fn default() -> Self { + Self::error("", "") + } +} + +/// A leaf node in the untyped syntax tree. +#[derive(Clone, Eq, PartialEq, Hash)] +struct LeafNode { + /// What kind of node this is (each kind would have its own struct in a + /// strongly typed AST). + kind: SyntaxKind, + /// The source text of the node. + text: EcoString, + /// The node's span. + span: Span, +} + +impl LeafNode { + /// Create a new leaf node. + #[track_caller] + fn new(kind: SyntaxKind, text: impl Into<EcoString>) -> Self { + debug_assert!(!kind.is_error()); + Self { kind, text: text.into(), span: Span::detached() } + } + + /// The byte length of the node in the source text. + fn len(&self) -> usize { + self.text.len() + } +} + +impl Debug for LeafNode { + fn fmt(&self, f: &mut Formatter) -> fmt::Result { + write!(f, "{:?}: {:?}", self.kind, self.text) + } +} + +/// An inner node in the untyped syntax tree. +#[derive(Clone, Eq, PartialEq, Hash)] +struct InnerNode { + /// What kind of node this is (each kind would have its own struct in a + /// strongly typed AST). + kind: SyntaxKind, + /// The byte length of the node in the source. + len: usize, + /// The node's span. + span: Span, + /// The number of nodes in the whole subtree, including this node. + descendants: usize, + /// Whether this node or any of its children are erroneous. + erroneous: bool, + /// The upper bound of this node's numbering range. + upper: u64, + /// This node's children, losslessly make up this node. + children: Vec<SyntaxNode>, +} + +impl InnerNode { + /// Create a new inner node with the given kind and children. + #[track_caller] + fn new(kind: SyntaxKind, children: Vec<SyntaxNode>) -> Self { + debug_assert!(!kind.is_error()); + + let mut len = 0; + let mut descendants = 1; + let mut erroneous = false; + + for child in &children { + len += child.len(); + descendants += child.descendants(); + erroneous |= child.erroneous(); + } + + Self { + kind, + len, + span: Span::detached(), + descendants, + erroneous, + upper: 0, + children, + } + } + + /// Set a synthetic span for the node and all its descendants. + fn synthesize(&mut self, span: Span) { + self.span = span; + self.upper = span.number(); + for child in &mut self.children { + child.synthesize(span); + } + } + + /// Assign span numbers `within` an interval to this node's subtree or just + /// a `range` of its children. + fn numberize( + &mut self, + id: FileId, + range: Option<Range<usize>>, + within: Range<u64>, + ) -> NumberingResult { + // Determine how many nodes we will number. + let descendants = match &range { + Some(range) if range.is_empty() => return Ok(()), + Some(range) => self.children[range.clone()] + .iter() + .map(SyntaxNode::descendants) + .sum::<usize>(), + None => self.descendants, + }; + + // Determine the distance between two neighbouring assigned numbers. If + // possible, we try to fit all numbers into the left half of `within` + // so that there is space for future insertions. + let space = within.end - within.start; + let mut stride = space / (2 * descendants as u64); + if stride == 0 { + stride = space / self.descendants as u64; + if stride == 0 { + return Err(Unnumberable); + } + } + + // Number the node itself. + let mut start = within.start; + if range.is_none() { + let end = start + stride; + self.span = Span::new(id, (start + end) / 2); + self.upper = within.end; + start = end; + } + + // Number the children. + let len = self.children.len(); + for child in &mut self.children[range.unwrap_or(0..len)] { + let end = start + child.descendants() as u64 * stride; + child.numberize(id, start..end)?; + start = end; + } + + Ok(()) + } + + /// Replaces a range of children with a replacement. + /// + /// May have mutated the children if it returns `Err(_)`. + fn replace_children( + &mut self, + mut range: Range<usize>, + replacement: Vec<SyntaxNode>, + ) -> NumberingResult { + let superseded = &self.children[range.clone()]; + + // Compute the new byte length. + self.len = self.len + replacement.iter().map(SyntaxNode::len).sum::<usize>() + - superseded.iter().map(SyntaxNode::len).sum::<usize>(); + + // Compute the new number of descendants. + self.descendants = self.descendants + + replacement.iter().map(SyntaxNode::descendants).sum::<usize>() + - superseded.iter().map(SyntaxNode::descendants).sum::<usize>(); + + // Determine whether we're still erroneous after the replacement. That's + // the case if + // - any of the new nodes is erroneous, + // - or if we were erroneous before due to a non-superseded node. + self.erroneous = replacement.iter().any(SyntaxNode::erroneous) + || (self.erroneous + && (self.children[..range.start].iter().any(SyntaxNode::erroneous)) + || self.children[range.end..].iter().any(SyntaxNode::erroneous)); + + // Perform the replacement. + let replacement_count = replacement.len(); + self.children.splice(range.clone(), replacement); + range.end = range.start + replacement_count; + + // Renumber the new children. Retries until it works, taking + // exponentially more children into account. + let mut left = 0; + let mut right = 0; + let max_left = range.start; + let max_right = self.children.len() - range.end; + loop { + let renumber = range.start - left..range.end + right; + + // The minimum assignable number is either + // - the upper bound of the node right before the to-be-renumbered + // children, + // - or this inner node's span number plus one if renumbering starts + // at the first child. + let start_number = renumber + .start + .checked_sub(1) + .and_then(|i| self.children.get(i)) + .map_or(self.span.number() + 1, |child| child.upper()); + + // The upper bound for renumbering is either + // - the span number of the first child after the to-be-renumbered + // children, + // - or this node's upper bound if renumbering ends behind the last + // child. + let end_number = self + .children + .get(renumber.end) + .map_or(self.upper, |next| next.span().number()); + + // Try to renumber. + let within = start_number..end_number; + let id = self.span.id(); + if self.numberize(id, Some(renumber), within).is_ok() { + return Ok(()); + } + + // If it didn't even work with all children, we give up. + if left == max_left && right == max_right { + return Err(Unnumberable); + } + + // Exponential expansion to both sides. + left = (left + 1).next_power_of_two().min(max_left); + right = (right + 1).next_power_of_two().min(max_right); + } + } + + /// Update this node after changes were made to one of its children. + fn update_parent( + &mut self, + prev_len: usize, + new_len: usize, + prev_descendants: usize, + new_descendants: usize, + ) { + self.len = self.len + new_len - prev_len; + self.descendants = self.descendants + new_descendants - prev_descendants; + self.erroneous = self.children.iter().any(SyntaxNode::erroneous); + } +} + +impl Debug for InnerNode { + fn fmt(&self, f: &mut Formatter) -> fmt::Result { + write!(f, "{:?}: {}", self.kind, self.len)?; + if !self.children.is_empty() { + f.write_str(" ")?; + f.debug_list().entries(&self.children).finish()?; + } + Ok(()) + } +} + +/// An error node in the untyped syntax tree. +#[derive(Clone, Eq, PartialEq, Hash)] +struct ErrorNode { + /// The source text of the node. + text: EcoString, + /// The syntax error. + error: SyntaxError, +} + +impl ErrorNode { + /// Create new error node. + fn new(message: impl Into<EcoString>, text: impl Into<EcoString>) -> Self { + Self { + text: text.into(), + error: SyntaxError { + span: Span::detached(), + message: message.into(), + hints: vec![], + }, + } + } + + /// The byte length of the node in the source text. + fn len(&self) -> usize { + self.text.len() + } + + /// Add a user-presentable hint to this error node. + fn hint(&mut self, hint: impl Into<EcoString>) { + self.error.hints.push(hint.into()); + } +} + +impl Debug for ErrorNode { + fn fmt(&self, f: &mut Formatter) -> fmt::Result { + write!(f, "Error: {:?} ({})", self.text, self.error.message) + } +} + +/// A syntactical error. +#[derive(Debug, Clone, Eq, PartialEq, Hash)] +pub struct SyntaxError { + /// The node's span. + pub span: Span, + /// The error message. + pub message: EcoString, + /// Additonal hints to the user, indicating how this error could be avoided + /// or worked around. + pub hints: Vec<EcoString>, +} + +/// A syntax node in a context. +/// +/// Knows its exact offset in the file and provides access to its +/// children, parent and siblings. +/// +/// **Note that all sibling and leaf accessors skip over trivia!** +#[derive(Clone)] +pub struct LinkedNode<'a> { + node: &'a SyntaxNode, + parent: Option<Rc<Self>>, + index: usize, + offset: usize, +} + +impl<'a> LinkedNode<'a> { + /// Start a new traversal at a root node. + pub fn new(root: &'a SyntaxNode) -> Self { + Self { node: root, parent: None, index: 0, offset: 0 } + } + + /// Get the contained syntax node. + pub fn get(&self) -> &'a SyntaxNode { + self.node + } + + /// The index of this node in its parent's children list. + pub fn index(&self) -> usize { + self.index + } + + /// The absolute byte offset of this node in the source file. + pub fn offset(&self) -> usize { + self.offset + } + + /// The byte range of this node in the source file. + pub fn range(&self) -> Range<usize> { + self.offset..self.offset + self.node.len() + } + + /// An iterator over this node's children. + pub fn children(&self) -> LinkedChildren<'a> { + LinkedChildren { + parent: Rc::new(self.clone()), + iter: self.node.children().enumerate(), + front: self.offset, + back: self.offset + self.len(), + } + } + + /// Find a descendant with the given span. + pub fn find(&self, span: Span) -> Option<LinkedNode<'a>> { + if self.span() == span { + return Some(self.clone()); + } + + if let Repr::Inner(inner) = &self.0 { + // The parent of a subtree has a smaller span number than all of its + // descendants. Therefore, we can bail out early if the target span's + // number is smaller than our number. + if span.number() < inner.span.number() { + return None; + } + + let mut children = self.children().peekable(); + while let Some(child) = children.next() { + // Every node in this child's subtree has a smaller span number than + // the next sibling. Therefore we only need to recurse if the next + // sibling's span number is larger than the target span's number. + if children + .peek() + .map_or(true, |next| next.span().number() > span.number()) + { + if let Some(found) = child.find(span) { + return Some(found); + } + } + } + } + + None + } +} + +/// Access to parents and siblings. +impl<'a> LinkedNode<'a> { + /// Get this node's parent. + pub fn parent(&self) -> Option<&Self> { + self.parent.as_deref() + } + + /// Get the first previous non-trivia sibling node. + pub fn prev_sibling(&self) -> Option<Self> { + let parent = self.parent()?; + let index = self.index.checked_sub(1)?; + let node = parent.node.children().nth(index)?; + let offset = self.offset - node.len(); + let prev = Self { node, parent: self.parent.clone(), index, offset }; + if prev.kind().is_trivia() { + prev.prev_sibling() + } else { + Some(prev) + } + } + + /// Get the next non-trivia sibling node. + pub fn next_sibling(&self) -> Option<Self> { + let parent = self.parent()?; + let index = self.index.checked_add(1)?; + let node = parent.node.children().nth(index)?; + let offset = self.offset + self.node.len(); + let next = Self { node, parent: self.parent.clone(), index, offset }; + if next.kind().is_trivia() { + next.next_sibling() + } else { + Some(next) + } + } + + /// Get the kind of this node's parent. + pub fn parent_kind(&self) -> Option<SyntaxKind> { + Some(self.parent()?.node.kind()) + } + + /// Get the kind of this node's first previous non-trivia sibling. + pub fn prev_sibling_kind(&self) -> Option<SyntaxKind> { + Some(self.prev_sibling()?.node.kind()) + } + + /// Get the kind of this node's next non-trivia sibling. + pub fn next_sibling_kind(&self) -> Option<SyntaxKind> { + Some(self.next_sibling()?.node.kind()) + } +} + +/// Access to leafs. +impl<'a> LinkedNode<'a> { + /// Get the rightmost non-trivia leaf before this node. + pub fn prev_leaf(&self) -> Option<Self> { + let mut node = self.clone(); + while let Some(prev) = node.prev_sibling() { + if let Some(leaf) = prev.rightmost_leaf() { + return Some(leaf); + } + node = prev; + } + self.parent()?.prev_leaf() + } + + /// Find the leftmost contained non-trivia leaf. + pub fn leftmost_leaf(&self) -> Option<Self> { + if self.is_leaf() && !self.kind().is_trivia() && !self.kind().is_error() { + return Some(self.clone()); + } + + for child in self.children() { + if let Some(leaf) = child.leftmost_leaf() { + return Some(leaf); + } + } + + None + } + + /// Get the leaf at the specified byte offset. + pub fn leaf_at(&self, cursor: usize) -> Option<Self> { + if self.node.children().len() == 0 && cursor <= self.offset + self.len() { + return Some(self.clone()); + } + + let mut offset = self.offset; + let count = self.node.children().len(); + for (i, child) in self.children().enumerate() { + let len = child.len(); + if (offset < cursor && cursor <= offset + len) + || (offset == cursor && i + 1 == count) + { + return child.leaf_at(cursor); + } + offset += len; + } + + None + } + + /// Find the rightmost contained non-trivia leaf. + pub fn rightmost_leaf(&self) -> Option<Self> { + if self.is_leaf() && !self.kind().is_trivia() { + return Some(self.clone()); + } + + for child in self.children().rev() { + if let Some(leaf) = child.rightmost_leaf() { + return Some(leaf); + } + } + + None + } + + /// Get the leftmost non-trivia leaf after this node. + pub fn next_leaf(&self) -> Option<Self> { + let mut node = self.clone(); + while let Some(next) = node.next_sibling() { + if let Some(leaf) = next.leftmost_leaf() { + return Some(leaf); + } + node = next; + } + self.parent()?.next_leaf() + } +} + +impl Deref for LinkedNode<'_> { + type Target = SyntaxNode; + + fn deref(&self) -> &Self::Target { + self.get() + } +} + +impl Debug for LinkedNode<'_> { + fn fmt(&self, f: &mut Formatter) -> fmt::Result { + self.node.fmt(f) + } +} + +/// An iterator over the children of a linked node. +pub struct LinkedChildren<'a> { + parent: Rc<LinkedNode<'a>>, + iter: std::iter::Enumerate<std::slice::Iter<'a, SyntaxNode>>, + front: usize, + back: usize, +} + +impl<'a> Iterator for LinkedChildren<'a> { + type Item = LinkedNode<'a>; + + fn next(&mut self) -> Option<Self::Item> { + self.iter.next().map(|(index, node)| { + let offset = self.front; + self.front += node.len(); + LinkedNode { + node, + parent: Some(self.parent.clone()), + index, + offset, + } + }) + } + + fn size_hint(&self) -> (usize, Option<usize>) { + self.iter.size_hint() + } +} + +impl DoubleEndedIterator for LinkedChildren<'_> { + fn next_back(&mut self) -> Option<Self::Item> { + self.iter.next_back().map(|(index, node)| { + self.back -= node.len(); + LinkedNode { + node, + parent: Some(self.parent.clone()), + index, + offset: self.back, + } + }) + } +} + +impl ExactSizeIterator for LinkedChildren<'_> {} + +/// Result of numbering a node within an interval. +pub(super) type NumberingResult = Result<(), Unnumberable>; + +/// Indicates that a node cannot be numbered within a given interval. +#[derive(Debug, Copy, Clone, Eq, PartialEq)] +pub(super) struct Unnumberable; + +impl Display for Unnumberable { + fn fmt(&self, f: &mut Formatter) -> fmt::Result { + f.pad("cannot number within this interval") + } +} + +impl std::error::Error for Unnumberable {} + +#[cfg(test)] +mod tests { + use super::*; + use crate::Source; + + #[test] + fn test_linked_node() { + let source = Source::detached("#set text(12pt, red)"); + + // Find "text". + let node = LinkedNode::new(source.root()).leaf_at(7).unwrap(); + assert_eq!(node.offset(), 5); + assert_eq!(node.text(), "text"); + + // Go back to "#set". Skips the space. + let prev = node.prev_sibling().unwrap(); + assert_eq!(prev.offset(), 1); + assert_eq!(prev.text(), "set"); + } + + #[test] + fn test_linked_node_non_trivia_leaf() { + let source = Source::detached("#set fun(12pt, red)"); + let leaf = LinkedNode::new(source.root()).leaf_at(6).unwrap(); + let prev = leaf.prev_leaf().unwrap(); + assert_eq!(leaf.text(), "fun"); + assert_eq!(prev.text(), "set"); + + let source = Source::detached("#let x = 10"); + let leaf = LinkedNode::new(source.root()).leaf_at(9).unwrap(); + let prev = leaf.prev_leaf().unwrap(); + let next = leaf.next_leaf().unwrap(); + assert_eq!(prev.text(), "="); + assert_eq!(leaf.text(), " "); + assert_eq!(next.text(), "10"); + } +} diff --git a/crates/typst-syntax/src/parser.rs b/crates/typst-syntax/src/parser.rs new file mode 100644 index 00000000..24cf7116 --- /dev/null +++ b/crates/typst-syntax/src/parser.rs @@ -0,0 +1,1760 @@ +use std::collections::HashSet; +use std::ops::Range; + +use ecow::{eco_format, EcoString}; +use unicode_math_class::MathClass; + +use super::{ast, is_newline, LexMode, Lexer, SyntaxKind, SyntaxNode}; + +/// Parse a source file. +#[tracing::instrument(skip_all)] +pub fn parse(text: &str) -> SyntaxNode { + let mut p = Parser::new(text, 0, LexMode::Markup); + markup(&mut p, true, 0, |_| false); + p.finish().into_iter().next().unwrap() +} + +/// Parse top-level code. +#[tracing::instrument(skip_all)] +pub fn parse_code(text: &str) -> SyntaxNode { + let mut p = Parser::new(text, 0, LexMode::Code); + let m = p.marker(); + p.skip(); + code_exprs(&mut p, |_| false); + p.wrap_all(m, SyntaxKind::Code); + p.finish().into_iter().next().unwrap() +} + +/// Parse top-level math. +#[tracing::instrument(skip_all)] +pub fn parse_math(text: &str) -> SyntaxNode { + let mut p = Parser::new(text, 0, LexMode::Math); + math(&mut p, |_| false); + p.finish().into_iter().next().unwrap() +} + +fn markup( + p: &mut Parser, + mut at_start: bool, + min_indent: usize, + mut stop: impl FnMut(&Parser) -> bool, +) { + let m = p.marker(); + let mut nesting: usize = 0; + while !p.eof() { + match p.current() { + SyntaxKind::LeftBracket => nesting += 1, + SyntaxKind::RightBracket if nesting > 0 => nesting -= 1, + _ if stop(p) => break, + _ => {} + } + + if p.newline() { + at_start = true; + if min_indent > 0 && p.column(p.current_end()) < min_indent { + break; + } + p.eat(); + continue; + } + + let prev = p.prev_end(); + markup_expr(p, &mut at_start); + if !p.progress(prev) { + p.unexpected(); + } + } + p.wrap(m, SyntaxKind::Markup); +} + +pub(super) fn reparse_markup( + text: &str, + range: Range<usize>, + at_start: &mut bool, + nesting: &mut usize, + mut stop: impl FnMut(SyntaxKind) -> bool, +) -> Option<Vec<SyntaxNode>> { + let mut p = Parser::new(text, range.start, LexMode::Markup); + while !p.eof() && p.current_start() < range.end { + match p.current() { + SyntaxKind::LeftBracket => *nesting += 1, + SyntaxKind::RightBracket if *nesting > 0 => *nesting -= 1, + _ if stop(p.current()) => break, + _ => {} + } + + if p.newline() { + *at_start = true; + p.eat(); + continue; + } + + let prev = p.prev_end(); + markup_expr(&mut p, at_start); + if !p.progress(prev) { + p.unexpected(); + } + } + (p.balanced && p.current_start() == range.end).then(|| p.finish()) +} + +fn markup_expr(p: &mut Parser, at_start: &mut bool) { + match p.current() { + SyntaxKind::Space + | SyntaxKind::Parbreak + | SyntaxKind::LineComment + | SyntaxKind::BlockComment => { + p.eat(); + return; + } + + SyntaxKind::Text + | SyntaxKind::Linebreak + | SyntaxKind::Escape + | SyntaxKind::Shorthand + | SyntaxKind::SmartQuote + | SyntaxKind::Raw + | SyntaxKind::Link + | SyntaxKind::Label => p.eat(), + + SyntaxKind::Hashtag => embedded_code_expr(p), + SyntaxKind::Star => strong(p), + SyntaxKind::Underscore => emph(p), + SyntaxKind::HeadingMarker if *at_start => heading(p), + SyntaxKind::ListMarker if *at_start => list_item(p), + SyntaxKind::EnumMarker if *at_start => enum_item(p), + SyntaxKind::TermMarker if *at_start => term_item(p), + SyntaxKind::RefMarker => reference(p), + SyntaxKind::Dollar => equation(p), + + SyntaxKind::LeftBracket + | SyntaxKind::RightBracket + | SyntaxKind::HeadingMarker + | SyntaxKind::ListMarker + | SyntaxKind::EnumMarker + | SyntaxKind::TermMarker + | SyntaxKind::Colon => p.convert(SyntaxKind::Text), + + _ => {} + } + + *at_start = false; +} + +fn strong(p: &mut Parser) { + let m = p.marker(); + p.assert(SyntaxKind::Star); + markup(p, false, 0, |p| { + p.at(SyntaxKind::Star) + || p.at(SyntaxKind::Parbreak) + || p.at(SyntaxKind::RightBracket) + }); + p.expect_closing_delimiter(m, SyntaxKind::Star); + p.wrap(m, SyntaxKind::Strong); +} + +fn emph(p: &mut Parser) { + let m = p.marker(); + p.assert(SyntaxKind::Underscore); + markup(p, false, 0, |p| { + p.at(SyntaxKind::Underscore) + || p.at(SyntaxKind::Parbreak) + || p.at(SyntaxKind::RightBracket) + }); + p.expect_closing_delimiter(m, SyntaxKind::Underscore); + p.wrap(m, SyntaxKind::Emph); +} + +fn heading(p: &mut Parser) { + let m = p.marker(); + p.assert(SyntaxKind::HeadingMarker); + whitespace_line(p); + markup(p, false, usize::MAX, |p| { + p.at(SyntaxKind::Label) + || p.at(SyntaxKind::RightBracket) + || (p.at(SyntaxKind::Space) && p.lexer.clone().next() == SyntaxKind::Label) + }); + p.wrap(m, SyntaxKind::Heading); +} + +fn list_item(p: &mut Parser) { + let m = p.marker(); + p.assert(SyntaxKind::ListMarker); + let min_indent = p.column(p.prev_end()); + whitespace_line(p); + markup(p, false, min_indent, |p| p.at(SyntaxKind::RightBracket)); + p.wrap(m, SyntaxKind::ListItem); +} + +fn enum_item(p: &mut Parser) { + let m = p.marker(); + p.assert(SyntaxKind::EnumMarker); + let min_indent = p.column(p.prev_end()); + whitespace_line(p); + markup(p, false, min_indent, |p| p.at(SyntaxKind::RightBracket)); + p.wrap(m, SyntaxKind::EnumItem); +} + +fn term_item(p: &mut Parser) { + let m = p.marker(); + p.assert(SyntaxKind::TermMarker); + let min_indent = p.column(p.prev_end()); + whitespace_line(p); + markup(p, false, usize::MAX, |p| { + p.at(SyntaxKind::Colon) || p.at(SyntaxKind::RightBracket) + }); + p.expect(SyntaxKind::Colon); + whitespace_line(p); + markup(p, false, min_indent, |p| p.at(SyntaxKind::RightBracket)); + p.wrap(m, SyntaxKind::TermItem); +} + +fn reference(p: &mut Parser) { + let m = p.marker(); + p.assert(SyntaxKind::RefMarker); + if p.directly_at(SyntaxKind::LeftBracket) { + content_block(p); + } + p.wrap(m, SyntaxKind::Ref); +} + +fn whitespace_line(p: &mut Parser) { + while !p.newline() && p.current().is_trivia() { + p.eat(); + } +} + +fn equation(p: &mut Parser) { + let m = p.marker(); + p.enter(LexMode::Math); + p.assert(SyntaxKind::Dollar); + math(p, |p| p.at(SyntaxKind::Dollar)); + p.expect_closing_delimiter(m, SyntaxKind::Dollar); + p.exit(); + p.wrap(m, SyntaxKind::Equation); +} + +fn math(p: &mut Parser, mut stop: impl FnMut(&Parser) -> bool) { + let m = p.marker(); + while !p.eof() && !stop(p) { + let prev = p.prev_end(); + math_expr(p); + if !p.progress(prev) { + p.unexpected(); + } + } + p.wrap(m, SyntaxKind::Math); +} + +fn math_expr(p: &mut Parser) { + math_expr_prec(p, 0, SyntaxKind::Eof) +} + +fn math_expr_prec(p: &mut Parser, min_prec: usize, stop: SyntaxKind) { + let m = p.marker(); + let mut continuable = false; + match p.current() { + SyntaxKind::Hashtag => embedded_code_expr(p), + SyntaxKind::MathIdent => { + continuable = true; + p.eat(); + while p.directly_at(SyntaxKind::Text) + && p.current_text() == "." + && matches!( + p.lexer.clone().next(), + SyntaxKind::MathIdent | SyntaxKind::Text + ) + { + p.convert(SyntaxKind::Dot); + p.convert(SyntaxKind::Ident); + p.wrap(m, SyntaxKind::FieldAccess); + } + if min_prec < 3 && p.directly_at(SyntaxKind::Text) && p.current_text() == "(" + { + math_args(p); + p.wrap(m, SyntaxKind::FuncCall); + continuable = false; + } + } + + SyntaxKind::Text | SyntaxKind::Shorthand => { + continuable = matches!( + math_class(p.current_text()), + None | Some(MathClass::Alphabetic) + ); + if !maybe_delimited(p, true) { + p.eat(); + } + } + + SyntaxKind::Linebreak | SyntaxKind::MathAlignPoint => p.eat(), + SyntaxKind::Escape | SyntaxKind::Str => { + continuable = true; + p.eat(); + } + + SyntaxKind::Root => { + if min_prec < 3 { + p.eat(); + let m2 = p.marker(); + math_expr_prec(p, 2, stop); + math_unparen(p, m2); + p.wrap(m, SyntaxKind::MathRoot); + } + } + + SyntaxKind::Prime => { + // Means that there is nothing to attach the prime to. + continuable = true; + while p.at(SyntaxKind::Prime) { + let m2 = p.marker(); + p.eat(); + // Eat the group until the space. + while p.eat_if_direct(SyntaxKind::Prime) {} + p.wrap(m2, SyntaxKind::MathPrimes); + } + } + + _ => p.expected("expression"), + } + + if continuable + && min_prec < 3 + && p.prev_end() == p.current_start() + && maybe_delimited(p, false) + { + p.wrap(m, SyntaxKind::Math); + } + + // Whether there were _any_ primes in the loop. + let mut primed = false; + + while !p.eof() && !p.at(stop) { + if p.directly_at(SyntaxKind::Text) && p.current_text() == "!" { + p.eat(); + p.wrap(m, SyntaxKind::Math); + continue; + } + + let prime_marker = p.marker(); + if p.eat_if_direct(SyntaxKind::Prime) { + // Eat as many primes as possible. + while p.eat_if_direct(SyntaxKind::Prime) {} + p.wrap(prime_marker, SyntaxKind::MathPrimes); + + // Will not be continued, so need to wrap the prime as attachment. + if p.at(stop) { + p.wrap(m, SyntaxKind::MathAttach); + } + + primed = true; + continue; + } + + // Separate primes and superscripts to different attachments. + if primed && p.current() == SyntaxKind::Hat { + p.wrap(m, SyntaxKind::MathAttach); + } + + let Some((kind, stop, assoc, mut prec)) = math_op(p.current()) else { + // No attachments, so we need to wrap primes as attachment. + if primed { + p.wrap(m, SyntaxKind::MathAttach); + } + + break; + }; + + if primed && kind == SyntaxKind::MathFrac { + p.wrap(m, SyntaxKind::MathAttach); + } + + if prec < min_prec { + break; + } + + match assoc { + ast::Assoc::Left => prec += 1, + ast::Assoc::Right => {} + } + + if kind == SyntaxKind::MathFrac { + math_unparen(p, m); + } + + p.eat(); + let m2 = p.marker(); + math_expr_prec(p, prec, stop); + math_unparen(p, m2); + + if p.eat_if(SyntaxKind::Underscore) || (!primed && p.eat_if(SyntaxKind::Hat)) { + let m3 = p.marker(); + math_expr_prec(p, prec, SyntaxKind::Eof); + math_unparen(p, m3); + } + + p.wrap(m, kind); + } +} + +fn maybe_delimited(p: &mut Parser, allow_fence: bool) -> bool { + if allow_fence && math_class(p.current_text()) == Some(MathClass::Fence) { + math_delimited(p, MathClass::Fence); + true + } else if math_class(p.current_text()) == Some(MathClass::Opening) { + math_delimited(p, MathClass::Closing); + true + } else { + false + } +} + +fn math_delimited(p: &mut Parser, stop: MathClass) { + let m = p.marker(); + p.eat(); + let m2 = p.marker(); + while !p.eof() && !p.at(SyntaxKind::Dollar) { + let class = math_class(p.current_text()); + if stop == MathClass::Fence && class == Some(MathClass::Closing) { + break; + } + + if class == Some(stop) { + p.wrap(m2, SyntaxKind::Math); + p.eat(); + p.wrap(m, SyntaxKind::MathDelimited); + return; + } + + let prev = p.prev_end(); + math_expr(p); + if !p.progress(prev) { + p.unexpected(); + } + } + + p.wrap(m, SyntaxKind::Math); +} + +fn math_unparen(p: &mut Parser, m: Marker) { + let Some(node) = p.nodes.get_mut(m.0) else { return }; + if node.kind() != SyntaxKind::MathDelimited { + return; + } + + if let [first, .., last] = node.children_mut() { + if first.text() == "(" && last.text() == ")" { + first.convert_to_kind(SyntaxKind::LeftParen); + last.convert_to_kind(SyntaxKind::RightParen); + } + } + + node.convert_to_kind(SyntaxKind::Math); +} + +fn math_class(text: &str) -> Option<MathClass> { + match text { + "[|" => return Some(MathClass::Opening), + "|]" => return Some(MathClass::Closing), + "||" => return Some(MathClass::Fence), + _ => {} + } + + let mut chars = text.chars(); + chars + .next() + .filter(|_| chars.next().is_none()) + .and_then(unicode_math_class::class) +} + +fn math_op(kind: SyntaxKind) -> Option<(SyntaxKind, SyntaxKind, ast::Assoc, usize)> { + match kind { + SyntaxKind::Underscore => { + Some((SyntaxKind::MathAttach, SyntaxKind::Hat, ast::Assoc::Right, 2)) + } + SyntaxKind::Hat => { + Some((SyntaxKind::MathAttach, SyntaxKind::Underscore, ast::Assoc::Right, 2)) + } + SyntaxKind::Slash => { + Some((SyntaxKind::MathFrac, SyntaxKind::Eof, ast::Assoc::Left, 1)) + } + _ => None, + } +} + +fn math_args(p: &mut Parser) { + let m = p.marker(); + p.convert(SyntaxKind::LeftParen); + + let mut namable = true; + let mut named = None; + let mut has_arrays = false; + let mut array = p.marker(); + let mut arg = p.marker(); + + while !p.eof() && !p.at(SyntaxKind::Dollar) { + if namable + && (p.at(SyntaxKind::MathIdent) || p.at(SyntaxKind::Text)) + && p.text[p.current_end()..].starts_with(':') + { + p.convert(SyntaxKind::Ident); + p.convert(SyntaxKind::Colon); + named = Some(arg); + arg = p.marker(); + array = p.marker(); + } + + match p.current_text() { + ")" => break, + ";" => { + maybe_wrap_in_math(p, arg, named); + p.wrap(array, SyntaxKind::Array); + p.convert(SyntaxKind::Semicolon); + array = p.marker(); + arg = p.marker(); + namable = true; + named = None; + has_arrays = true; + continue; + } + "," => { + maybe_wrap_in_math(p, arg, named); + p.convert(SyntaxKind::Comma); + arg = p.marker(); + namable = true; + if named.is_some() { + array = p.marker(); + named = None; + } + continue; + } + _ => {} + } + + let prev = p.prev_end(); + math_expr(p); + if !p.progress(prev) { + p.unexpected(); + } + + namable = false; + } + + if arg != p.marker() { + maybe_wrap_in_math(p, arg, named); + if named.is_some() { + array = p.marker(); + } + } + + if has_arrays && array != p.marker() { + p.wrap(array, SyntaxKind::Array); + } + + if p.at(SyntaxKind::Text) && p.current_text() == ")" { + p.convert(SyntaxKind::RightParen); + } else { + p.expected("closing paren"); + p.balanced = false; + } + + p.wrap(m, SyntaxKind::Args); +} + +fn maybe_wrap_in_math(p: &mut Parser, arg: Marker, named: Option<Marker>) { + let exprs = p.post_process(arg).filter(|node| node.is::<ast::Expr>()).count(); + if exprs != 1 { + p.wrap(arg, SyntaxKind::Math); + } + + if let Some(m) = named { + p.wrap(m, SyntaxKind::Named); + } +} + +fn code(p: &mut Parser, stop: impl FnMut(&Parser) -> bool) { + let m = p.marker(); + code_exprs(p, stop); + p.wrap(m, SyntaxKind::Code); +} + +fn code_exprs(p: &mut Parser, mut stop: impl FnMut(&Parser) -> bool) { + while !p.eof() && !stop(p) { + p.stop_at_newline(true); + let prev = p.prev_end(); + code_expr(p); + if p.progress(prev) && !p.eof() && !stop(p) && !p.eat_if(SyntaxKind::Semicolon) { + p.expected("semicolon or line break"); + } + p.unstop(); + if !p.progress(prev) && !p.eof() { + p.unexpected(); + } + } +} + +fn code_expr(p: &mut Parser) { + code_expr_prec(p, false, 0, false) +} + +fn code_expr_or_pattern(p: &mut Parser) { + code_expr_prec(p, false, 0, true) +} + +fn embedded_code_expr(p: &mut Parser) { + p.stop_at_newline(true); + p.enter(LexMode::Code); + p.assert(SyntaxKind::Hashtag); + p.unskip(); + + let stmt = matches!( + p.current(), + SyntaxKind::Let + | SyntaxKind::Set + | SyntaxKind::Show + | SyntaxKind::Import + | SyntaxKind::Include + ); + + let prev = p.prev_end(); + code_expr_prec(p, true, 0, false); + + // Consume error for things like `#12p` or `#"abc\"`.# + if !p.progress(prev) { + if p.current().is_trivia() { + // p.unskip(); + } else if !p.eof() { + p.unexpected(); + } + } + + let semi = + (stmt || p.directly_at(SyntaxKind::Semicolon)) && p.eat_if(SyntaxKind::Semicolon); + + if stmt && !semi && !p.eof() && !p.at(SyntaxKind::RightBracket) { + p.expected("semicolon or line break"); + } + + p.exit(); + p.unstop(); +} + +fn code_expr_prec( + p: &mut Parser, + atomic: bool, + min_prec: usize, + allow_destructuring: bool, +) { + let m = p.marker(); + if let (false, Some(op)) = (atomic, ast::UnOp::from_kind(p.current())) { + p.eat(); + code_expr_prec(p, atomic, op.precedence(), false); + p.wrap(m, SyntaxKind::Unary); + } else { + code_primary(p, atomic, allow_destructuring); + } + + loop { + if p.directly_at(SyntaxKind::LeftParen) || p.directly_at(SyntaxKind::LeftBracket) + { + args(p); + p.wrap(m, SyntaxKind::FuncCall); + continue; + } + + let at_field_or_method = + p.directly_at(SyntaxKind::Dot) && p.lexer.clone().next() == SyntaxKind::Ident; + + if atomic && !at_field_or_method { + break; + } + + if p.eat_if(SyntaxKind::Dot) { + p.expect(SyntaxKind::Ident); + p.wrap(m, SyntaxKind::FieldAccess); + continue; + } + + let binop = + if ast::BinOp::NotIn.precedence() >= min_prec && p.eat_if(SyntaxKind::Not) { + if p.at(SyntaxKind::In) { + Some(ast::BinOp::NotIn) + } else { + p.expected("keyword `in`"); + break; + } + } else { + ast::BinOp::from_kind(p.current()) + }; + + if let Some(op) = binop { + let mut prec = op.precedence(); + if prec < min_prec { + break; + } + + match op.assoc() { + ast::Assoc::Left => prec += 1, + ast::Assoc::Right => {} + } + + p.eat(); + code_expr_prec(p, false, prec, false); + p.wrap(m, SyntaxKind::Binary); + continue; + } + + break; + } +} + +fn code_primary(p: &mut Parser, atomic: bool, allow_destructuring: bool) { + let m = p.marker(); + match p.current() { + SyntaxKind::Ident => { + p.eat(); + if !atomic && p.at(SyntaxKind::Arrow) { + p.wrap(m, SyntaxKind::Params); + p.assert(SyntaxKind::Arrow); + code_expr(p); + p.wrap(m, SyntaxKind::Closure); + } + } + SyntaxKind::Underscore if !atomic => { + p.eat(); + if p.at(SyntaxKind::Arrow) { + p.wrap(m, SyntaxKind::Params); + p.eat(); + code_expr(p); + p.wrap(m, SyntaxKind::Closure); + } else if let Some(underscore) = p.node_mut(m) { + underscore.convert_to_error("expected expression, found underscore"); + } + } + + SyntaxKind::LeftBrace => code_block(p), + SyntaxKind::LeftBracket => content_block(p), + SyntaxKind::LeftParen => with_paren(p, allow_destructuring), + SyntaxKind::Dollar => equation(p), + SyntaxKind::Let => let_binding(p), + SyntaxKind::Set => set_rule(p), + SyntaxKind::Show => show_rule(p), + SyntaxKind::If => conditional(p), + SyntaxKind::While => while_loop(p), + SyntaxKind::For => for_loop(p), + SyntaxKind::Import => module_import(p), + SyntaxKind::Include => module_include(p), + SyntaxKind::Break => break_stmt(p), + SyntaxKind::Continue => continue_stmt(p), + SyntaxKind::Return => return_stmt(p), + + SyntaxKind::None + | SyntaxKind::Auto + | SyntaxKind::Int + | SyntaxKind::Float + | SyntaxKind::Bool + | SyntaxKind::Numeric + | SyntaxKind::Str + | SyntaxKind::Label + | SyntaxKind::Raw => p.eat(), + + _ => p.expected("expression"), + } +} + +fn block(p: &mut Parser) { + match p.current() { + SyntaxKind::LeftBracket => content_block(p), + SyntaxKind::LeftBrace => code_block(p), + _ => p.expected("block"), + } +} + +pub(super) fn reparse_block(text: &str, range: Range<usize>) -> Option<SyntaxNode> { + let mut p = Parser::new(text, range.start, LexMode::Code); + assert!(p.at(SyntaxKind::LeftBracket) || p.at(SyntaxKind::LeftBrace)); + block(&mut p); + (p.balanced && p.prev_end() == range.end) + .then(|| p.finish().into_iter().next().unwrap()) +} + +fn code_block(p: &mut Parser) { + let m = p.marker(); + p.enter(LexMode::Code); + p.stop_at_newline(false); + p.assert(SyntaxKind::LeftBrace); + code(p, |p| { + p.at(SyntaxKind::RightBrace) + || p.at(SyntaxKind::RightBracket) + || p.at(SyntaxKind::RightParen) + }); + p.expect_closing_delimiter(m, SyntaxKind::RightBrace); + p.exit(); + p.unstop(); + p.wrap(m, SyntaxKind::CodeBlock); +} + +fn content_block(p: &mut Parser) { + let m = p.marker(); + p.enter(LexMode::Markup); + p.assert(SyntaxKind::LeftBracket); + markup(p, true, 0, |p| p.at(SyntaxKind::RightBracket)); + p.expect_closing_delimiter(m, SyntaxKind::RightBracket); + p.exit(); + p.wrap(m, SyntaxKind::ContentBlock); +} + +fn with_paren(p: &mut Parser, allow_destructuring: bool) { + let m = p.marker(); + let mut kind = collection(p, true); + if p.at(SyntaxKind::Arrow) { + validate_params_at(p, m); + p.wrap(m, SyntaxKind::Params); + p.assert(SyntaxKind::Arrow); + code_expr(p); + kind = SyntaxKind::Closure; + } else if p.at(SyntaxKind::Eq) && kind != SyntaxKind::Parenthesized { + // TODO: add warning if p.at(SyntaxKind::Eq) && kind == SyntaxKind::Parenthesized + + validate_pattern_at(p, m, false); + p.wrap(m, SyntaxKind::Destructuring); + p.assert(SyntaxKind::Eq); + code_expr(p); + kind = SyntaxKind::DestructAssignment; + } + + match kind { + SyntaxKind::Array if !allow_destructuring => validate_array_at(p, m), + SyntaxKind::Dict if !allow_destructuring => validate_dict_at(p, m), + SyntaxKind::Parenthesized if !allow_destructuring => { + validate_parenthesized_at(p, m) + } + SyntaxKind::Destructuring if !allow_destructuring => { + invalidate_destructuring(p, m) + } + _ => {} + } + p.wrap(m, kind); +} + +fn invalidate_destructuring(p: &mut Parser, m: Marker) { + let mut collection_kind = Option::None; + for child in p.post_process(m) { + match child.kind() { + SyntaxKind::Named | SyntaxKind::Keyed => match collection_kind { + Some(SyntaxKind::Array) => child.convert_to_error(eco_format!( + "expected expression, found {}", + child.kind().name() + )), + _ => collection_kind = Some(SyntaxKind::Dict), + }, + SyntaxKind::LeftParen | SyntaxKind::RightParen | SyntaxKind::Comma => {} + kind => match collection_kind { + Some(SyntaxKind::Dict) => child.convert_to_error(eco_format!( + "expected named or keyed pair, found {}", + kind.name() + )), + _ => collection_kind = Some(SyntaxKind::Array), + }, + } + } +} + +fn collection(p: &mut Parser, keyed: bool) -> SyntaxKind { + p.stop_at_newline(false); + + let m = p.marker(); + p.assert(SyntaxKind::LeftParen); + + let mut count = 0; + let mut parenthesized = true; + let mut kind = None; + if keyed && p.eat_if(SyntaxKind::Colon) { + kind = Some(SyntaxKind::Dict); + parenthesized = false; + } + + while !p.current().is_terminator() { + let prev = p.prev_end(); + match item(p, keyed) { + SyntaxKind::Spread => parenthesized = false, + SyntaxKind::Named | SyntaxKind::Keyed => { + match kind { + Some(SyntaxKind::Array) => kind = Some(SyntaxKind::Destructuring), + _ => kind = Some(SyntaxKind::Dict), + } + parenthesized = false; + } + SyntaxKind::Int => match kind { + Some(SyntaxKind::Array) | None => kind = Some(SyntaxKind::Array), + Some(_) => kind = Some(SyntaxKind::Destructuring), + }, + _ if kind.is_none() => kind = Some(SyntaxKind::Array), + _ => {} + } + + if !p.progress(prev) { + p.unexpected(); + continue; + } + + count += 1; + + if p.current().is_terminator() { + break; + } + + if p.expect(SyntaxKind::Comma) { + parenthesized = false; + } + } + + p.expect_closing_delimiter(m, SyntaxKind::RightParen); + p.unstop(); + + if parenthesized && count == 1 { + SyntaxKind::Parenthesized + } else { + kind.unwrap_or(SyntaxKind::Array) + } +} + +fn item(p: &mut Parser, keyed: bool) -> SyntaxKind { + let m = p.marker(); + + if p.eat_if(SyntaxKind::Dots) { + if p.at(SyntaxKind::Comma) || p.at(SyntaxKind::RightParen) { + p.wrap(m, SyntaxKind::Spread); + return SyntaxKind::Spread; + } + + code_expr(p); + p.wrap(m, SyntaxKind::Spread); + return SyntaxKind::Spread; + } + + if p.at(SyntaxKind::Underscore) { + // This is a temporary workaround to fix `v.map(_ => {})`. + let mut lexer = p.lexer.clone(); + let next = + std::iter::from_fn(|| Some(lexer.next())).find(|kind| !kind.is_trivia()); + if next != Some(SyntaxKind::Arrow) { + p.eat(); + return SyntaxKind::Underscore; + } + } + + code_expr_or_pattern(p); + + if !p.eat_if(SyntaxKind::Colon) { + return SyntaxKind::Int; + } + + if !p.eat_if(SyntaxKind::Underscore) { + code_expr(p); + } + + let kind = match p.node(m).map(SyntaxNode::kind) { + Some(SyntaxKind::Ident) => SyntaxKind::Named, + Some(SyntaxKind::Str) if keyed => SyntaxKind::Keyed, + _ => { + for child in p.post_process(m) { + if child.kind() == SyntaxKind::Colon { + break; + } + + let mut message = EcoString::from("expected identifier"); + if keyed { + message.push_str(" or string"); + } + message.push_str(", found "); + message.push_str(child.kind().name()); + child.convert_to_error(message); + } + SyntaxKind::Named + } + }; + + p.wrap(m, kind); + kind +} + +fn args(p: &mut Parser) { + if !p.at(SyntaxKind::LeftParen) && !p.at(SyntaxKind::LeftBracket) { + p.expected("argument list"); + } + + let m = p.marker(); + if p.at(SyntaxKind::LeftParen) { + collection(p, false); + validate_args_at(p, m); + } + + while p.directly_at(SyntaxKind::LeftBracket) { + content_block(p); + } + + p.wrap(m, SyntaxKind::Args); +} + +enum PatternKind { + Ident, + Placeholder, + Destructuring, +} + +fn pattern(p: &mut Parser) -> PatternKind { + let m = p.marker(); + if p.at(SyntaxKind::LeftParen) { + let kind = collection(p, false); + validate_pattern_at(p, m, true); + + if kind == SyntaxKind::Parenthesized { + PatternKind::Ident + } else { + p.wrap(m, SyntaxKind::Destructuring); + PatternKind::Destructuring + } + } else if p.eat_if(SyntaxKind::Underscore) { + PatternKind::Placeholder + } else { + p.expect(SyntaxKind::Ident); + PatternKind::Ident + } +} + +fn let_binding(p: &mut Parser) { + let m = p.marker(); + p.assert(SyntaxKind::Let); + + let m2 = p.marker(); + let mut closure = false; + let mut destructuring = false; + match pattern(p) { + PatternKind::Ident => { + closure = p.directly_at(SyntaxKind::LeftParen); + if closure { + let m3 = p.marker(); + collection(p, false); + validate_params_at(p, m3); + p.wrap(m3, SyntaxKind::Params); + } + } + PatternKind::Placeholder => {} + PatternKind::Destructuring => destructuring = true, + } + + let f = if closure || destructuring { Parser::expect } else { Parser::eat_if }; + if f(p, SyntaxKind::Eq) { + code_expr(p); + } + + if closure { + p.wrap(m2, SyntaxKind::Closure); + } + + p.wrap(m, SyntaxKind::LetBinding); +} + +fn set_rule(p: &mut Parser) { + let m = p.marker(); + p.assert(SyntaxKind::Set); + + let m2 = p.marker(); + p.expect(SyntaxKind::Ident); + while p.eat_if(SyntaxKind::Dot) { + p.expect(SyntaxKind::Ident); + p.wrap(m2, SyntaxKind::FieldAccess); + } + + args(p); + if p.eat_if(SyntaxKind::If) { + code_expr(p); + } + p.wrap(m, SyntaxKind::SetRule); +} + +fn show_rule(p: &mut Parser) { + let m = p.marker(); + p.assert(SyntaxKind::Show); + let m2 = p.before_trivia(); + + if !p.at(SyntaxKind::Colon) { + code_expr(p); + } + + if p.eat_if(SyntaxKind::Colon) { + code_expr(p); + } else { + p.expected_at(m2, "colon"); + } + + p.wrap(m, SyntaxKind::ShowRule); +} + +fn conditional(p: &mut Parser) { + let m = p.marker(); + p.assert(SyntaxKind::If); + code_expr(p); + block(p); + if p.eat_if(SyntaxKind::Else) { + if p.at(SyntaxKind::If) { + conditional(p); + } else { + block(p); + } + } + p.wrap(m, SyntaxKind::Conditional); +} + +fn while_loop(p: &mut Parser) { + let m = p.marker(); + p.assert(SyntaxKind::While); + code_expr(p); + block(p); + p.wrap(m, SyntaxKind::WhileLoop); +} + +fn for_loop(p: &mut Parser) { + let m = p.marker(); + p.assert(SyntaxKind::For); + pattern(p); + if p.at(SyntaxKind::Comma) { + p.expected("keyword `in`"); + p.hint("did you mean to use a destructuring pattern?"); + if !p.eat_if(SyntaxKind::Ident) { + p.eat_if(SyntaxKind::Underscore); + } + p.eat_if(SyntaxKind::In); + } else { + p.expect(SyntaxKind::In); + } + code_expr(p); + block(p); + p.wrap(m, SyntaxKind::ForLoop); +} + +fn module_import(p: &mut Parser) { + let m = p.marker(); + p.assert(SyntaxKind::Import); + code_expr(p); + if p.eat_if(SyntaxKind::Colon) && !p.eat_if(SyntaxKind::Star) { + import_items(p); + } + p.wrap(m, SyntaxKind::ModuleImport); +} + +fn import_items(p: &mut Parser) { + let m = p.marker(); + while !p.eof() && !p.at(SyntaxKind::Semicolon) { + if !p.eat_if(SyntaxKind::Ident) { + p.unexpected(); + } + if p.current().is_terminator() { + break; + } + p.expect(SyntaxKind::Comma); + } + p.wrap(m, SyntaxKind::ImportItems); +} + +fn module_include(p: &mut Parser) { + let m = p.marker(); + p.assert(SyntaxKind::Include); + code_expr(p); + p.wrap(m, SyntaxKind::ModuleInclude); +} + +fn break_stmt(p: &mut Parser) { + let m = p.marker(); + p.assert(SyntaxKind::Break); + p.wrap(m, SyntaxKind::LoopBreak); +} + +fn continue_stmt(p: &mut Parser) { + let m = p.marker(); + p.assert(SyntaxKind::Continue); + p.wrap(m, SyntaxKind::LoopContinue); +} + +fn return_stmt(p: &mut Parser) { + let m = p.marker(); + p.assert(SyntaxKind::Return); + if !p.current().is_terminator() && !p.at(SyntaxKind::Comma) { + code_expr(p); + } + p.wrap(m, SyntaxKind::FuncReturn); +} + +fn validate_parenthesized_at(p: &mut Parser, m: Marker) { + for child in p.post_process(m) { + let kind = child.kind(); + match kind { + SyntaxKind::Array => validate_array(child.children_mut().iter_mut()), + SyntaxKind::Dict => validate_dict(child.children_mut().iter_mut()), + SyntaxKind::Underscore => { + child.convert_to_error(eco_format!( + "expected expression, found {}", + kind.name() + )); + } + _ => {} + } + } +} + +fn validate_array_at(p: &mut Parser, m: Marker) { + validate_array(p.post_process(m)) +} + +fn validate_array<'a>(children: impl Iterator<Item = &'a mut SyntaxNode>) { + for child in children { + let kind = child.kind(); + match kind { + SyntaxKind::Array => validate_array(child.children_mut().iter_mut()), + SyntaxKind::Dict => validate_dict(child.children_mut().iter_mut()), + SyntaxKind::Named | SyntaxKind::Keyed | SyntaxKind::Underscore => { + child.convert_to_error(eco_format!( + "expected expression, found {}", + kind.name() + )); + } + _ => {} + } + } +} + +fn validate_dict_at(p: &mut Parser, m: Marker) { + validate_dict(p.post_process(m)) +} + +fn validate_dict<'a>(children: impl Iterator<Item = &'a mut SyntaxNode>) { + let mut used = HashSet::new(); + for child in children { + match child.kind() { + SyntaxKind::Named | SyntaxKind::Keyed => { + let Some(first) = child.children_mut().first_mut() else { continue }; + let key = match first.cast::<ast::Str>() { + Some(str) => str.get(), + None => first.text().clone(), + }; + + if !used.insert(key.clone()) { + first.convert_to_error(eco_format!("duplicate key: {key}")); + child.make_erroneous(); + } + } + SyntaxKind::Spread => {} + SyntaxKind::LeftParen + | SyntaxKind::RightParen + | SyntaxKind::Comma + | SyntaxKind::Colon + | SyntaxKind::Space => {} + kind => { + child.convert_to_error(eco_format!( + "expected named or keyed pair, found {}", + kind.name() + )); + } + } + } +} + +fn validate_params_at(p: &mut Parser, m: Marker) { + let mut used_spread = false; + let mut used = HashSet::new(); + for child in p.post_process(m) { + match child.kind() { + SyntaxKind::Ident => { + if !used.insert(child.text().clone()) { + child.convert_to_error(eco_format!( + "duplicate parameter: {}", + child.text() + )); + } + } + SyntaxKind::Named => { + let Some(within) = child.children_mut().first_mut() else { return }; + if !used.insert(within.text().clone()) { + within.convert_to_error(eco_format!( + "duplicate parameter: {}", + within.text() + )); + child.make_erroneous(); + } + } + SyntaxKind::Spread => { + let Some(within) = child.children_mut().last_mut() else { continue }; + if used_spread { + child.convert_to_error("only one argument sink is allowed"); + continue; + } + used_spread = true; + if within.kind() == SyntaxKind::Dots { + continue; + } else if within.kind() != SyntaxKind::Ident { + within.convert_to_error(eco_format!( + "expected identifier, found {}", + within.kind().name(), + )); + child.make_erroneous(); + continue; + } + if !used.insert(within.text().clone()) { + within.convert_to_error(eco_format!( + "duplicate parameter: {}", + within.text() + )); + child.make_erroneous(); + } + } + SyntaxKind::Array | SyntaxKind::Dict | SyntaxKind::Destructuring => { + validate_pattern(child.children_mut().iter_mut(), &mut used, false); + child.convert_to_kind(SyntaxKind::Destructuring); + } + SyntaxKind::LeftParen + | SyntaxKind::RightParen + | SyntaxKind::Comma + | SyntaxKind::Underscore => {} + kind => { + child.convert_to_error(eco_format!( + "expected identifier, named pair or argument sink, found {}", + kind.name() + )); + } + } + } +} + +fn validate_args_at(p: &mut Parser, m: Marker) { + let mut used = HashSet::new(); + for child in p.post_process(m) { + if child.kind() == SyntaxKind::Named { + let Some(within) = child.children_mut().first_mut() else { return }; + if !used.insert(within.text().clone()) { + within.convert_to_error(eco_format!( + "duplicate argument: {}", + within.text() + )); + child.make_erroneous(); + } + } else if child.kind() == SyntaxKind::Underscore { + child.convert_to_error("unexpected underscore"); + } + } +} + +fn validate_pattern_at(p: &mut Parser, m: Marker, forbid_expressions: bool) { + let mut used = HashSet::new(); + validate_pattern(p.post_process(m), &mut used, forbid_expressions); +} + +fn validate_pattern<'a>( + children: impl Iterator<Item = &'a mut SyntaxNode>, + used: &mut HashSet<EcoString>, + forbid_expressions: bool, +) { + let mut used_spread = false; + for child in children { + match child.kind() { + SyntaxKind::Ident => { + if !used.insert(child.text().clone()) { + child.convert_to_error( + "at most one binding per identifier is allowed", + ); + } + } + SyntaxKind::Spread => { + let Some(within) = child.children_mut().last_mut() else { continue }; + if used_spread { + child.convert_to_error("at most one destructuring sink is allowed"); + continue; + } + used_spread = true; + + if within.kind() == SyntaxKind::Dots { + continue; + } else if forbid_expressions && within.kind() != SyntaxKind::Ident { + within.convert_to_error(eco_format!( + "expected identifier, found {}", + within.kind().name(), + )); + child.make_erroneous(); + continue; + } + + if !used.insert(within.text().clone()) { + within.convert_to_error( + "at most one binding per identifier is allowed", + ); + child.make_erroneous(); + } + } + SyntaxKind::Named => { + let Some(within) = child.children_mut().first_mut() else { return }; + if !used.insert(within.text().clone()) { + within.convert_to_error( + "at most one binding per identifier is allowed", + ); + child.make_erroneous(); + } + + if forbid_expressions { + let Some(within) = child.children_mut().last_mut() else { return }; + if within.kind() != SyntaxKind::Ident + && within.kind() != SyntaxKind::Underscore + { + within.convert_to_error(eco_format!( + "expected identifier, found {}", + within.kind().name(), + )); + child.make_erroneous(); + } + } + } + SyntaxKind::LeftParen + | SyntaxKind::RightParen + | SyntaxKind::Comma + | SyntaxKind::Underscore => {} + kind => { + if forbid_expressions { + child.convert_to_error(eco_format!( + "expected identifier or destructuring sink, found {}", + kind.name() + )); + } + } + } + } +} + +/// Manages parsing of a stream of tokens. +struct Parser<'s> { + text: &'s str, + lexer: Lexer<'s>, + prev_end: usize, + current_start: usize, + current: SyntaxKind, + modes: Vec<LexMode>, + nodes: Vec<SyntaxNode>, + stop_at_newline: Vec<bool>, + balanced: bool, +} + +#[derive(Debug, Copy, Clone, Eq, PartialEq)] +struct Marker(usize); + +impl<'s> Parser<'s> { + fn new(text: &'s str, offset: usize, mode: LexMode) -> Self { + let mut lexer = Lexer::new(text, mode); + lexer.jump(offset); + let current = lexer.next(); + Self { + lexer, + text, + prev_end: offset, + current_start: offset, + current, + modes: vec![], + nodes: vec![], + stop_at_newline: vec![], + balanced: true, + } + } + + fn finish(self) -> Vec<SyntaxNode> { + self.nodes + } + + fn prev_end(&self) -> usize { + self.prev_end + } + + fn current(&self) -> SyntaxKind { + self.current + } + + fn current_start(&self) -> usize { + self.current_start + } + + fn current_end(&self) -> usize { + self.lexer.cursor() + } + + fn current_text(&self) -> &'s str { + &self.text[self.current_start..self.current_end()] + } + + fn at(&self, kind: SyntaxKind) -> bool { + self.current == kind + } + + #[track_caller] + fn assert(&mut self, kind: SyntaxKind) { + assert_eq!(self.current, kind); + self.eat(); + } + + fn eof(&self) -> bool { + self.at(SyntaxKind::Eof) + } + + fn directly_at(&self, kind: SyntaxKind) -> bool { + self.current == kind && self.prev_end == self.current_start + } + + /// Eats if at `kind`. + /// + /// Note: In math and code mode, this will ignore trivia in front of the + /// `kind`, To forbid skipping trivia, consider using `eat_if_direct`. + fn eat_if(&mut self, kind: SyntaxKind) -> bool { + let at = self.at(kind); + if at { + self.eat(); + } + at + } + + /// Eats only if currently at the start of `kind`. + fn eat_if_direct(&mut self, kind: SyntaxKind) -> bool { + let at = self.directly_at(kind); + if at { + self.eat(); + } + at + } + + fn convert(&mut self, kind: SyntaxKind) { + self.current = kind; + self.eat(); + } + + fn newline(&mut self) -> bool { + self.lexer.newline() + } + + fn column(&self, at: usize) -> usize { + self.text[..at].chars().rev().take_while(|&c| !is_newline(c)).count() + } + + fn marker(&self) -> Marker { + Marker(self.nodes.len()) + } + + fn node(&self, m: Marker) -> Option<&SyntaxNode> { + self.nodes.get(m.0) + } + + fn node_mut(&mut self, m: Marker) -> Option<&mut SyntaxNode> { + self.nodes.get_mut(m.0) + } + + fn post_process(&mut self, m: Marker) -> impl Iterator<Item = &mut SyntaxNode> { + self.nodes[m.0..] + .iter_mut() + .filter(|child| !child.kind().is_error() && !child.kind().is_trivia()) + } + + fn wrap(&mut self, from: Marker, kind: SyntaxKind) { + self.wrap_within(from, self.before_trivia(), kind); + } + + fn wrap_all(&mut self, from: Marker, kind: SyntaxKind) { + self.wrap_within(from, Marker(self.nodes.len()), kind) + } + + fn wrap_within(&mut self, from: Marker, to: Marker, kind: SyntaxKind) { + let len = self.nodes.len(); + let to = to.0.min(len); + let from = from.0.min(to); + let children = self.nodes.drain(from..to).collect(); + self.nodes.insert(from, SyntaxNode::inner(kind, children)); + } + + fn progress(&self, offset: usize) -> bool { + offset < self.prev_end + } + + fn enter(&mut self, mode: LexMode) { + self.modes.push(self.lexer.mode()); + self.lexer.set_mode(mode); + } + + fn exit(&mut self) { + let mode = self.modes.pop().unwrap(); + if mode != self.lexer.mode() { + self.unskip(); + self.lexer.set_mode(mode); + self.lexer.jump(self.current_start); + self.lex(); + self.skip(); + } + } + + fn stop_at_newline(&mut self, stop: bool) { + self.stop_at_newline.push(stop); + } + + fn unstop(&mut self) { + self.unskip(); + self.stop_at_newline.pop(); + self.lexer.jump(self.prev_end); + self.lex(); + self.skip(); + } + + fn eat(&mut self) { + self.save(); + self.lex(); + self.skip(); + } + + fn skip(&mut self) { + if self.lexer.mode() != LexMode::Markup { + while self.current.is_trivia() { + self.save(); + self.lex(); + } + } + } + + fn unskip(&mut self) { + if self.lexer.mode() != LexMode::Markup && self.prev_end != self.current_start { + while self.nodes.last().map_or(false, |last| last.kind().is_trivia()) { + self.nodes.pop(); + } + + self.lexer.jump(self.prev_end); + self.lex(); + } + } + + fn save(&mut self) { + let text = self.current_text(); + if self.at(SyntaxKind::Error) { + let message = self.lexer.take_error().unwrap(); + self.nodes.push(SyntaxNode::error(message, text)); + } else { + self.nodes.push(SyntaxNode::leaf(self.current, text)); + } + + if self.lexer.mode() == LexMode::Markup || !self.current.is_trivia() { + self.prev_end = self.current_end(); + } + } + + fn lex(&mut self) { + self.current_start = self.lexer.cursor(); + self.current = self.lexer.next(); + if self.lexer.mode() == LexMode::Code + && self.lexer.newline() + && self.stop_at_newline.last().copied().unwrap_or(false) + && !matches!(self.lexer.clone().next(), SyntaxKind::Else | SyntaxKind::Dot) + { + self.current = SyntaxKind::Eof; + } + } +} + +impl<'s> Parser<'s> { + /// Consume the given syntax `kind` or produce an error. + fn expect(&mut self, kind: SyntaxKind) -> bool { + let at = self.at(kind); + if at { + self.eat(); + } else if kind == SyntaxKind::Ident && self.current.is_keyword() { + self.expected_found(kind.name(), self.current.name()); + } else { + self.balanced &= !kind.is_grouping(); + self.expected(kind.name()); + } + at + } + + /// Produce an error that the given `thing` was expected. + fn expected(&mut self, thing: &str) { + if !self.after_error() { + self.expected_at(self.before_trivia(), thing); + } + } + + /// Produce an error that the given `thing` was expected but another + /// thing was `found` and consume the next token. + fn expected_found(&mut self, thing: &str, found: &str) { + self.trim_errors(); + self.convert_to_error(eco_format!("expected {thing}, found {found}")); + } + + /// Produce an error that the given `thing` was expected at the position + /// of the marker `m`. + fn expected_at(&mut self, m: Marker, thing: &str) { + let message = eco_format!("expected {}", thing); + let error = SyntaxNode::error(message, ""); + self.nodes.insert(m.0, error); + } + + /// Produce an error for the unclosed delimiter `kind` at the position + /// `open`. + fn expect_closing_delimiter(&mut self, open: Marker, kind: SyntaxKind) { + if !self.eat_if(kind) { + self.nodes[open.0].convert_to_error("unclosed delimiter"); + } + } + + /// Consume the next token (if any) and produce an error stating that it was + /// unexpected. + fn unexpected(&mut self) { + self.trim_errors(); + self.convert_to_error(eco_format!("unexpected {}", self.current.name())); + } + + /// Consume the next token and turn it into an error. + fn convert_to_error(&mut self, message: EcoString) { + let kind = self.current; + let offset = self.nodes.len(); + self.eat(); + self.balanced &= !kind.is_grouping(); + if !kind.is_error() { + self.nodes[offset].convert_to_error(message); + } + } + + /// Adds a hint to the last node, if the last node is an error. + fn hint(&mut self, hint: impl Into<EcoString>) { + let m = self.before_trivia(); + if m.0 > 0 { + self.nodes[m.0 - 1].hint(hint); + } + } + + /// Get a marker after the last non-trivia node. + fn before_trivia(&self) -> Marker { + let mut i = self.nodes.len(); + if self.lexer.mode() != LexMode::Markup && self.prev_end != self.current_start { + while i > 0 && self.nodes[i - 1].kind().is_trivia() { + i -= 1; + } + } + Marker(i) + } + + /// Whether the last non-trivia node is an error. + fn after_error(&mut self) -> bool { + let m = self.before_trivia(); + m.0 > 0 && self.nodes[m.0 - 1].kind().is_error() + } + + /// Remove trailing errors with zero length. + fn trim_errors(&mut self) { + let Marker(end) = self.before_trivia(); + let mut start = end; + while start > 0 + && self.nodes[start - 1].kind().is_error() + && self.nodes[start - 1].is_empty() + { + start -= 1; + } + self.nodes.drain(start..end); + } +} diff --git a/crates/typst-syntax/src/reparser.rs b/crates/typst-syntax/src/reparser.rs new file mode 100644 index 00000000..a4186fa7 --- /dev/null +++ b/crates/typst-syntax/src/reparser.rs @@ -0,0 +1,322 @@ +use std::ops::Range; + +use super::{ + is_newline, parse, reparse_block, reparse_markup, Span, SyntaxKind, SyntaxNode, +}; + +/// Refresh the given syntax node with as little parsing as possible. +/// +/// Takes the new text, the range in the old text that was replaced and the +/// length of the replacement and returns the range in the new text that was +/// ultimately reparsed. +/// +/// The high-level API for this function is +/// [`Source::edit`](super::Source::edit). +pub fn reparse( + root: &mut SyntaxNode, + text: &str, + replaced: Range<usize>, + replacement_len: usize, +) -> Range<usize> { + try_reparse(text, replaced, replacement_len, None, root, 0).unwrap_or_else(|| { + let id = root.span().id(); + *root = parse(text); + root.numberize(id, Span::FULL).unwrap(); + 0..text.len() + }) +} + +/// Try to reparse inside the given node. +fn try_reparse( + text: &str, + replaced: Range<usize>, + replacement_len: usize, + parent_kind: Option<SyntaxKind>, + node: &mut SyntaxNode, + offset: usize, +) -> Option<Range<usize>> { + // The range of children which overlap with the edit. + #[allow(clippy::reversed_empty_ranges)] + let mut overlap = usize::MAX..0; + let mut cursor = offset; + let node_kind = node.kind(); + + for (i, child) in node.children_mut().iter_mut().enumerate() { + let prev_range = cursor..cursor + child.len(); + let prev_len = child.len(); + let prev_desc = child.descendants(); + + // Does the child surround the edit? + // If so, try to reparse within it or itself. + if !child.is_leaf() && includes(&prev_range, &replaced) { + let new_len = prev_len + replacement_len - replaced.len(); + let new_range = cursor..cursor + new_len; + + // Try to reparse within the child. + if let Some(range) = try_reparse( + text, + replaced.clone(), + replacement_len, + Some(node_kind), + child, + cursor, + ) { + assert_eq!(child.len(), new_len); + let new_desc = child.descendants(); + node.update_parent(prev_len, new_len, prev_desc, new_desc); + return Some(range); + } + + // If the child is a block, try to reparse the block. + if child.kind().is_block() { + if let Some(newborn) = reparse_block(text, new_range.clone()) { + return node + .replace_children(i..i + 1, vec![newborn]) + .is_ok() + .then_some(new_range); + } + } + } + + // Does the child overlap with the edit? + if overlaps(&prev_range, &replaced) { + overlap.start = overlap.start.min(i); + overlap.end = i + 1; + } + + // Is the child beyond the edit? + if replaced.end < cursor { + break; + } + + cursor += child.len(); + } + + // Try to reparse a range of markup expressions within markup. This is only + // possible if the markup is top-level or contained in a block, not if it is + // contained in things like headings or lists because too much can go wrong + // with indent and line breaks. + if overlap.is_empty() + || node.kind() != SyntaxKind::Markup + || !matches!(parent_kind, None | Some(SyntaxKind::ContentBlock)) + { + return None; + } + + let children = node.children_mut(); + + // Reparse a segment. Retries until it works, taking exponentially more + // children into account. + let mut expansion = 1; + loop { + // Add slack in both directions. + let mut start = overlap.start.saturating_sub(expansion.max(2)); + let mut end = (overlap.end + expansion).min(children.len()); + + // Expand to the left. + while start > 0 && expand(&children[start]) { + start -= 1; + } + + // Expand to the right. + while end < children.len() && expand(&children[end]) { + end += 1; + } + + // Also take hashtag. + if start > 0 && children[start - 1].kind() == SyntaxKind::Hashtag { + start -= 1; + } + + // Synthesize what `at_start` and `nesting` would be at the start of the + // reparse. + let mut prefix_len = 0; + let mut nesting = 0; + let mut at_start = true; + for child in &children[..start] { + prefix_len += child.len(); + next_at_start(child, &mut at_start); + next_nesting(child, &mut nesting); + } + + // Determine what `at_start` will have to be at the end of the reparse. + let mut prev_len = 0; + let mut prev_at_start_after = at_start; + let mut prev_nesting_after = nesting; + for child in &children[start..end] { + prev_len += child.len(); + next_at_start(child, &mut prev_at_start_after); + next_nesting(child, &mut prev_nesting_after); + } + + // Determine the range in the new text that we want to reparse. + let shifted = offset + prefix_len; + let new_len = prev_len + replacement_len - replaced.len(); + let new_range = shifted..shifted + new_len; + let at_end = end == children.len(); + + // Stop parsing early if this kind is encountered. + let stop_kind = match parent_kind { + Some(_) => SyntaxKind::RightBracket, + None => SyntaxKind::Eof, + }; + + // Reparse! + let reparsed = reparse_markup( + text, + new_range.clone(), + &mut at_start, + &mut nesting, + |kind| kind == stop_kind, + ); + + if let Some(newborns) = reparsed { + // If more children follow, at_start must match its previous value. + // Similarly, if we children follow or we not top-level the nesting + // must match its previous value. + if (at_end || at_start == prev_at_start_after) + && ((at_end && parent_kind.is_none()) || nesting == prev_nesting_after) + { + return node + .replace_children(start..end, newborns) + .is_ok() + .then_some(new_range); + } + } + + // If it didn't even work with all children, we give up. + if start == 0 && at_end { + break; + } + + // Exponential expansion to both sides. + expansion *= 2; + } + + None +} + +/// Whether the inner range is fully contained in the outer one (no touching). +fn includes(outer: &Range<usize>, inner: &Range<usize>) -> bool { + outer.start < inner.start && outer.end > inner.end +} + +/// Whether the first and second range overlap or touch. +fn overlaps(first: &Range<usize>, second: &Range<usize>) -> bool { + (first.start <= second.start && second.start <= first.end) + || (second.start <= first.start && first.start <= second.end) +} + +/// Whether the selection should be expanded beyond a node of this kind. +fn expand(node: &SyntaxNode) -> bool { + let kind = node.kind(); + kind.is_trivia() + || kind.is_error() + || kind == SyntaxKind::Semicolon + || node.text() == "/" + || node.text() == ":" +} + +/// Whether `at_start` would still be true after this node given the +/// previous value of the property. +fn next_at_start(node: &SyntaxNode, at_start: &mut bool) { + let kind = node.kind(); + if kind.is_trivia() { + *at_start |= kind == SyntaxKind::Parbreak + || (kind == SyntaxKind::Space && node.text().chars().any(is_newline)); + } else { + *at_start = false; + } +} + +/// Update `nesting` based on the node. +fn next_nesting(node: &SyntaxNode, nesting: &mut usize) { + if node.kind() == SyntaxKind::Text { + match node.text().as_str() { + "[" => *nesting += 1, + "]" if *nesting > 0 => *nesting -= 1, + _ => {} + } + } +} + +#[cfg(test)] +mod tests { + use std::ops::Range; + + use super::super::{parse, Source, Span}; + + #[track_caller] + fn test(prev: &str, range: Range<usize>, with: &str, incremental: bool) { + let mut source = Source::detached(prev); + let prev = source.root().clone(); + let range = source.edit(range, with); + let mut found = source.root().clone(); + let mut expected = parse(source.text()); + found.synthesize(Span::detached()); + expected.synthesize(Span::detached()); + if found != expected { + eprintln!("source: {:?}", source.text()); + eprintln!("previous: {prev:#?}"); + eprintln!("expected: {expected:#?}"); + eprintln!("found: {found:#?}"); + panic!("test failed"); + } + if incremental { + assert_ne!(source.len_bytes(), range.len(), "should have been incremental"); + } else { + assert_eq!( + source.len_bytes(), + range.len(), + "shouldn't have been incremental" + ); + } + } + + #[test] + fn test_reparse_markup() { + test("abc~def~gh~", 5..6, "+", true); + test("~~~~~~~", 3..4, "A", true); + test("abc~~", 1..2, "", true); + test("#var. hello", 5..6, " ", false); + test("#var;hello", 9..10, "a", false); + test("https:/world", 7..7, "/", false); + test("hello world", 7..12, "walkers", false); + test("some content", 0..12, "", false); + test("", 0..0, "do it", false); + test("a d e", 1..3, " b c d", false); + test("~*~*~", 2..2, "*", false); + test("::1\n2. a\n3", 7..7, "4", true); + test("* #{1+2} *", 6..7, "3", true); + test("#{(0, 1, 2)}", 6..7, "11pt", true); + test("\n= A heading", 4..4, "n evocative", false); + test("#call() abc~d", 7..7, "[]", true); + test("a your thing a", 6..7, "a", false); + test("#grid(columns: (auto, 1fr, 40%))", 16..20, "4pt", false); + test("abc\n= a heading\njoke", 3..4, "\nmore\n\n", true); + test("#show f: a => b..", 16..16, "c", false); + test("#for", 4..4, "//", false); + test("a\n#let \nb", 7..7, "i", true); + test(r"#{{let x = z}; a = 1} b", 7..7, "//", false); + test(r#"a ```typst hello```"#, 16..17, "", false); + } + + #[test] + fn test_reparse_block() { + test("Hello #{ x + 1 }!", 9..10, "abc", true); + test("A#{}!", 3..3, "\"", false); + test("#{ [= x] }!", 5..5, "=", true); + test("#[[]]", 3..3, "\\", true); + test("#[[ab]]", 4..5, "\\", true); + test("#{}}", 2..2, "{", false); + test("A: #[BC]", 6..6, "{", true); + test("A: #[BC]", 6..6, "#{", true); + test("A: #[BC]", 6..6, "#{}", true); + test("#{\"ab\"}A", 5..5, "c", true); + test("#{\"ab\"}A", 5..6, "c", false); + test("a#[]b", 3..3, "#{", true); + test("a#{call(); abc}b", 8..8, "[]", true); + test("a #while x {\n g(x) \n} b", 12..12, "//", true); + test("a#[]b", 3..3, "[hey]", true); + } +} diff --git a/crates/typst-syntax/src/source.rs b/crates/typst-syntax/src/source.rs new file mode 100644 index 00000000..25b3b86c --- /dev/null +++ b/crates/typst-syntax/src/source.rs @@ -0,0 +1,423 @@ +//! Source file management. + +use std::fmt::{self, Debug, Formatter}; +use std::hash::{Hash, Hasher}; +use std::ops::Range; +use std::sync::Arc; + +use comemo::Prehashed; + +use super::reparser::reparse; +use super::{is_newline, parse, FileId, LinkedNode, Span, SyntaxNode}; + +/// A source file. +/// +/// All line and column indices start at zero, just like byte indices. Only for +/// user-facing display, you should add 1 to them. +/// +/// Values of this type are cheap to clone and hash. +#[derive(Clone)] +pub struct Source(Arc<Repr>); + +/// The internal representation. +#[derive(Clone)] +struct Repr { + id: FileId, + text: Prehashed<String>, + root: Prehashed<SyntaxNode>, + lines: Vec<Line>, +} + +impl Source { + /// Create a new source file. + #[tracing::instrument(skip_all)] + pub fn new(id: FileId, text: String) -> Self { + let mut root = parse(&text); + root.numberize(id, Span::FULL).unwrap(); + Self(Arc::new(Repr { + id, + lines: lines(&text), + text: Prehashed::new(text), + root: Prehashed::new(root), + })) + } + + /// Create a source file without a real id and path, usually for testing. + pub fn detached(text: impl Into<String>) -> Self { + Self::new(FileId::detached(), text.into()) + } + + /// Create a source file with the same synthetic span for all nodes. + pub fn synthesized(text: String, span: Span) -> Self { + let mut root = parse(&text); + root.synthesize(span); + Self(Arc::new(Repr { + id: FileId::detached(), + lines: lines(&text), + text: Prehashed::new(text), + root: Prehashed::new(root), + })) + } + + /// The root node of the file's untyped syntax tree. + pub fn root(&self) -> &SyntaxNode { + &self.0.root + } + + /// The id of the source file. + pub fn id(&self) -> FileId { + self.0.id + } + + /// The whole source as a string slice. + pub fn text(&self) -> &str { + &self.0.text + } + + /// Slice out the part of the source code enclosed by the range. + pub fn get(&self, range: Range<usize>) -> Option<&str> { + self.text().get(range) + } + + /// Fully replace the source text. + pub fn replace(&mut self, text: String) { + let inner = Arc::make_mut(&mut self.0); + inner.text = Prehashed::new(text); + inner.lines = lines(&inner.text); + let mut root = parse(&inner.text); + root.numberize(inner.id, Span::FULL).unwrap(); + inner.root = Prehashed::new(root); + } + + /// Edit the source file by replacing the given range. + /// + /// Returns the range in the new source that was ultimately reparsed. + /// + /// The method panics if the `replace` range is out of bounds. + #[track_caller] + pub fn edit(&mut self, replace: Range<usize>, with: &str) -> Range<usize> { + let start_byte = replace.start; + let start_utf16 = self.byte_to_utf16(start_byte).unwrap(); + let line = self.byte_to_line(start_byte).unwrap(); + + let inner = Arc::make_mut(&mut self.0); + + // Update the text itself. + inner.text.update(|text| text.replace_range(replace.clone(), with)); + + // Remove invalidated line starts. + inner.lines.truncate(line + 1); + + // Handle adjoining of \r and \n. + if inner.text[..start_byte].ends_with('\r') && with.starts_with('\n') { + inner.lines.pop(); + } + + // Recalculate the line starts after the edit. + inner.lines.extend(lines_from( + start_byte, + start_utf16, + &inner.text[start_byte..], + )); + + // Incrementally reparse the replaced range. + inner + .root + .update(|root| reparse(root, &inner.text, replace, with.len())) + } + + /// Get the length of the file in UTF-8 encoded bytes. + pub fn len_bytes(&self) -> usize { + self.text().len() + } + + /// Get the length of the file in UTF-16 code units. + pub fn len_utf16(&self) -> usize { + let last = self.0.lines.last().unwrap(); + last.utf16_idx + len_utf16(&self.0.text[last.byte_idx..]) + } + + /// Get the length of the file in lines. + pub fn len_lines(&self) -> usize { + self.0.lines.len() + } + + /// Find the node with the given span. + /// + /// Returns `None` if the span does not point into this source file. + pub fn find(&self, span: Span) -> Option<LinkedNode<'_>> { + LinkedNode::new(self.root()).find(span) + } + + /// Get the byte range for the given span in this file. + /// + /// Panics if the span does not point into this source file. + #[track_caller] + pub fn range(&self, span: Span) -> Range<usize> { + self.find(span) + .expect("span does not point into this source file") + .range() + } + + /// Return the index of the UTF-16 code unit at the byte index. + pub fn byte_to_utf16(&self, byte_idx: usize) -> Option<usize> { + let line_idx = self.byte_to_line(byte_idx)?; + let line = self.0.lines.get(line_idx)?; + let head = self.0.text.get(line.byte_idx..byte_idx)?; + Some(line.utf16_idx + len_utf16(head)) + } + + /// Return the index of the line that contains the given byte index. + pub fn byte_to_line(&self, byte_idx: usize) -> Option<usize> { + (byte_idx <= self.0.text.len()).then(|| { + match self.0.lines.binary_search_by_key(&byte_idx, |line| line.byte_idx) { + Ok(i) => i, + Err(i) => i - 1, + } + }) + } + + /// Return the index of the column at the byte index. + /// + /// The column is defined as the number of characters in the line before the + /// byte index. + pub fn byte_to_column(&self, byte_idx: usize) -> Option<usize> { + let line = self.byte_to_line(byte_idx)?; + let start = self.line_to_byte(line)?; + let head = self.get(start..byte_idx)?; + Some(head.chars().count()) + } + + /// Return the byte index at the UTF-16 code unit. + pub fn utf16_to_byte(&self, utf16_idx: usize) -> Option<usize> { + let line = self.0.lines.get( + match self.0.lines.binary_search_by_key(&utf16_idx, |line| line.utf16_idx) { + Ok(i) => i, + Err(i) => i - 1, + }, + )?; + + let mut k = line.utf16_idx; + for (i, c) in self.0.text[line.byte_idx..].char_indices() { + if k >= utf16_idx { + return Some(line.byte_idx + i); + } + k += c.len_utf16(); + } + + (k == utf16_idx).then_some(self.0.text.len()) + } + + /// Return the byte position at which the given line starts. + pub fn line_to_byte(&self, line_idx: usize) -> Option<usize> { + self.0.lines.get(line_idx).map(|line| line.byte_idx) + } + + /// Return the range which encloses the given line. + pub fn line_to_range(&self, line_idx: usize) -> Option<Range<usize>> { + let start = self.line_to_byte(line_idx)?; + let end = self.line_to_byte(line_idx + 1).unwrap_or(self.0.text.len()); + Some(start..end) + } + + /// Return the byte index of the given (line, column) pair. + /// + /// The column defines the number of characters to go beyond the start of + /// the line. + pub fn line_column_to_byte( + &self, + line_idx: usize, + column_idx: usize, + ) -> Option<usize> { + let range = self.line_to_range(line_idx)?; + let line = self.get(range.clone())?; + let mut chars = line.chars(); + for _ in 0..column_idx { + chars.next(); + } + Some(range.start + (line.len() - chars.as_str().len())) + } +} + +impl Debug for Source { + fn fmt(&self, f: &mut Formatter) -> fmt::Result { + write!(f, "Source({})", self.id().path().display()) + } +} + +impl Hash for Source { + fn hash<H: Hasher>(&self, state: &mut H) { + self.0.id.hash(state); + self.0.text.hash(state); + self.0.root.hash(state); + } +} + +impl AsRef<str> for Source { + fn as_ref(&self) -> &str { + self.text() + } +} + +/// Metadata about a line. +#[derive(Debug, Copy, Clone, Eq, PartialEq)] +struct Line { + /// The UTF-8 byte offset where the line starts. + byte_idx: usize, + /// The UTF-16 codepoint offset where the line starts. + utf16_idx: usize, +} + +/// Create a line vector. +fn lines(text: &str) -> Vec<Line> { + std::iter::once(Line { byte_idx: 0, utf16_idx: 0 }) + .chain(lines_from(0, 0, text)) + .collect() +} + +/// Compute a line iterator from an offset. +fn lines_from( + byte_offset: usize, + utf16_offset: usize, + text: &str, +) -> impl Iterator<Item = Line> + '_ { + let mut s = unscanny::Scanner::new(text); + let mut utf16_idx = utf16_offset; + + std::iter::from_fn(move || { + s.eat_until(|c: char| { + utf16_idx += c.len_utf16(); + is_newline(c) + }); + + if s.done() { + return None; + } + + if s.eat() == Some('\r') && s.eat_if('\n') { + utf16_idx += 1; + } + + Some(Line { byte_idx: byte_offset + s.cursor(), utf16_idx }) + }) +} + +/// The number of code units this string would use if it was encoded in +/// UTF16. This runs in linear time. +fn len_utf16(string: &str) -> usize { + string.chars().map(char::len_utf16).sum() +} + +#[cfg(test)] +mod tests { + use super::*; + + const TEST: &str = "ä\tcde\nf💛g\r\nhi\rjkl"; + + #[test] + fn test_source_file_new() { + let source = Source::detached(TEST); + assert_eq!( + source.0.lines, + [ + Line { byte_idx: 0, utf16_idx: 0 }, + Line { byte_idx: 7, utf16_idx: 6 }, + Line { byte_idx: 15, utf16_idx: 12 }, + Line { byte_idx: 18, utf16_idx: 15 }, + ] + ); + } + + #[test] + fn test_source_file_pos_to_line() { + let source = Source::detached(TEST); + assert_eq!(source.byte_to_line(0), Some(0)); + assert_eq!(source.byte_to_line(2), Some(0)); + assert_eq!(source.byte_to_line(6), Some(0)); + assert_eq!(source.byte_to_line(7), Some(1)); + assert_eq!(source.byte_to_line(8), Some(1)); + assert_eq!(source.byte_to_line(12), Some(1)); + assert_eq!(source.byte_to_line(21), Some(3)); + assert_eq!(source.byte_to_line(22), None); + } + + #[test] + fn test_source_file_pos_to_column() { + let source = Source::detached(TEST); + assert_eq!(source.byte_to_column(0), Some(0)); + assert_eq!(source.byte_to_column(2), Some(1)); + assert_eq!(source.byte_to_column(6), Some(5)); + assert_eq!(source.byte_to_column(7), Some(0)); + assert_eq!(source.byte_to_column(8), Some(1)); + assert_eq!(source.byte_to_column(12), Some(2)); + } + + #[test] + fn test_source_file_utf16() { + #[track_caller] + fn roundtrip(source: &Source, byte_idx: usize, utf16_idx: usize) { + let middle = source.byte_to_utf16(byte_idx).unwrap(); + let result = source.utf16_to_byte(middle).unwrap(); + assert_eq!(middle, utf16_idx); + assert_eq!(result, byte_idx); + } + + let source = Source::detached(TEST); + roundtrip(&source, 0, 0); + roundtrip(&source, 2, 1); + roundtrip(&source, 3, 2); + roundtrip(&source, 8, 7); + roundtrip(&source, 12, 9); + roundtrip(&source, 21, 18); + assert_eq!(source.byte_to_utf16(22), None); + assert_eq!(source.utf16_to_byte(19), None); + } + + #[test] + fn test_source_file_roundtrip() { + #[track_caller] + fn roundtrip(source: &Source, byte_idx: usize) { + let line = source.byte_to_line(byte_idx).unwrap(); + let column = source.byte_to_column(byte_idx).unwrap(); + let result = source.line_column_to_byte(line, column).unwrap(); + assert_eq!(result, byte_idx); + } + + let source = Source::detached(TEST); + roundtrip(&source, 0); + roundtrip(&source, 7); + roundtrip(&source, 12); + roundtrip(&source, 21); + } + + #[test] + fn test_source_file_edit() { + // This tests only the non-parser parts. The reparsing itself is + // tested separately. + #[track_caller] + fn test(prev: &str, range: Range<usize>, with: &str, after: &str) { + let mut source = Source::detached(prev); + let result = Source::detached(after); + source.edit(range, with); + assert_eq!(source.text(), result.text()); + assert_eq!(source.0.lines, result.0.lines); + } + + // Test inserting at the beginning. + test("abc\n", 0..0, "hi\n", "hi\nabc\n"); + test("\nabc", 0..0, "hi\r", "hi\r\nabc"); + + // Test editing in the middle. + test(TEST, 4..16, "❌", "ä\tc❌i\rjkl"); + + // Test appending. + test("abc\ndef", 7..7, "hi", "abc\ndefhi"); + test("abc\ndef\n", 8..8, "hi", "abc\ndef\nhi"); + + // Test appending with adjoining \r and \n. + test("abc\ndef\r", 8..8, "\nghi", "abc\ndef\r\nghi"); + + // Test removing everything. + test(TEST, 0..21, "", ""); + } +} diff --git a/crates/typst-syntax/src/span.rs b/crates/typst-syntax/src/span.rs new file mode 100644 index 00000000..8715e476 --- /dev/null +++ b/crates/typst-syntax/src/span.rs @@ -0,0 +1,128 @@ +use std::fmt::{self, Debug, Formatter}; +use std::num::NonZeroU64; +use std::ops::Range; + +use super::FileId; + +/// A unique identifier for a syntax node. +/// +/// This is used throughout the compiler to track which source section an error +/// or element stems from. Can be [mapped back](super::Source::range) to a byte +/// range for user facing display. +/// +/// During editing, the span values stay mostly stable, even for nodes behind an +/// insertion. This is not true for simple ranges as they would shift. Spans can +/// be used as inputs to memoized functions without hurting cache performance +/// when text is inserted somewhere in the document other than the end. +/// +/// Span ids are ordered in the syntax tree to enable quickly finding the node +/// with some id: +/// - The id of a parent is always smaller than the ids of any of its children. +/// - The id of a node is always greater than any id in the subtrees of any left +/// sibling and smaller than any id in the subtrees of any right sibling. +/// +/// This type takes up 8 bytes and is null-optimized (i.e. `Option<Span>` also +/// takes 8 bytes). +#[derive(Debug, Copy, Clone, Eq, PartialEq, Hash)] +pub struct Span(NonZeroU64); + +impl Span { + /// The full range of numbers available for span numbering. + pub const FULL: Range<u64> = 2..(1 << Self::BITS); + const DETACHED: u64 = 1; + + // Data layout: + // | 16 bits source id | 48 bits number | + const BITS: usize = 48; + + /// Create a new span from a source id and a unique number. + /// + /// Panics if the `number` is not contained in `FULL`. + #[track_caller] + pub const fn new(id: FileId, number: u64) -> Self { + assert!( + Self::FULL.start <= number && number < Self::FULL.end, + "span number outside valid range" + ); + + Self::pack(id, number) + } + + /// A span that does not point into any source file. + pub const fn detached() -> Self { + Self::pack(FileId::detached(), Self::DETACHED) + } + + /// Pack the components into a span. + #[track_caller] + const fn pack(id: FileId, number: u64) -> Span { + let bits = ((id.as_u16() as u64) << Self::BITS) | number; + match NonZeroU64::new(bits) { + Some(v) => Self(v), + None => panic!("span encoding is zero"), + } + } + + /// The id of the source file the span points into. + pub const fn id(self) -> FileId { + FileId::from_u16((self.0.get() >> Self::BITS) as u16) + } + + /// The unique number of the span within its source file. + pub const fn number(self) -> u64 { + self.0.get() & ((1 << Self::BITS) - 1) + } + + /// Whether the span is detached. + pub const fn is_detached(self) -> bool { + self.id().is_detached() + } +} + +/// A value with a span locating it in the source code. +#[derive(Copy, Clone, Eq, PartialEq, Hash)] +pub struct Spanned<T> { + /// The spanned value. + pub v: T, + /// The value's location in source code. + pub span: Span, +} + +impl<T> Spanned<T> { + /// Create a new instance from a value and its span. + pub fn new(v: T, span: Span) -> Self { + Self { v, span } + } + + /// Convert from `&Spanned<T>` to `Spanned<&T>` + pub fn as_ref(&self) -> Spanned<&T> { + Spanned { v: &self.v, span: self.span } + } + + /// Map the value using a function. + pub fn map<F, U>(self, f: F) -> Spanned<U> + where + F: FnOnce(T) -> U, + { + Spanned { v: f(self.v), span: self.span } + } +} + +impl<T: Debug> Debug for Spanned<T> { + fn fmt(&self, f: &mut Formatter) -> fmt::Result { + self.v.fmt(f) + } +} + +#[cfg(test)] +mod tests { + use super::{FileId, Span}; + + #[test] + fn test_span_encoding() { + let id = FileId::from_u16(5); + let span = Span::new(id, 10); + assert_eq!(span.id(), id); + assert_eq!(span.number(), 10); + } +} |
