diff options
| author | Myriad-Dreamin <35292584+Myriad-Dreamin@users.noreply.github.com> | 2024-03-01 17:17:41 +0800 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2024-03-01 09:17:41 +0000 |
| commit | 030041466b5b8453ca23e43a6385f4592f78a56c (patch) | |
| tree | 7af9f2d34c349980881a2b9908a5ad8decce1616 /crates/typst-syntax | |
| parent | 57ab6d09248ba036e7feb32f8b9527ec643f826c (diff) | |
Provide more fine-grained spans in raw blocks (#3257)
Co-authored-by: Laurenz <laurmaedje@gmail.com>
Diffstat (limited to 'crates/typst-syntax')
| -rw-r--r-- | crates/typst-syntax/src/ast.rs | 100 | ||||
| -rw-r--r-- | crates/typst-syntax/src/highlight.rs | 3 | ||||
| -rw-r--r-- | crates/typst-syntax/src/kind.rs | 9 | ||||
| -rw-r--r-- | crates/typst-syntax/src/lexer.rs | 134 | ||||
| -rw-r--r-- | crates/typst-syntax/src/parser.rs | 22 | ||||
| -rw-r--r-- | crates/typst-syntax/src/set.rs | 16 |
6 files changed, 201 insertions, 83 deletions
diff --git a/crates/typst-syntax/src/ast.rs b/crates/typst-syntax/src/ast.rs index 8f8eaac4..fc689a68 100644 --- a/crates/typst-syntax/src/ast.rs +++ b/crates/typst-syntax/src/ast.rs @@ -8,9 +8,7 @@ use std::ops::Deref; use ecow::EcoString; use unscanny::Scanner; -use crate::{ - is_id_continue, is_id_start, is_newline, split_newlines, Span, SyntaxKind, SyntaxNode, -}; +use crate::{is_newline, Span, SyntaxKind, SyntaxNode}; /// A typed AST node. pub trait AstNode<'a>: Sized { @@ -558,87 +556,51 @@ node! { } impl<'a> Raw<'a> { - /// The trimmed raw text. - pub fn text(self) -> EcoString { - let mut text = self.0.text().as_str(); - let blocky = text.starts_with("```"); - text = text.trim_matches('`'); - - // Trim tag, one space at the start, and one space at the end if the - // last non-whitespace char is a backtick. - if blocky { - let mut s = Scanner::new(text); - if s.eat_if(is_id_start) { - s.eat_while(is_id_continue); - } - text = s.after(); - text = text.strip_prefix(' ').unwrap_or(text); - if text.trim_end().ends_with('`') { - text = text.strip_suffix(' ').unwrap_or(text); - } - } - - // Split into lines. - let mut lines = split_newlines(text); - - if blocky { - let dedent = lines - .iter() - .skip(1) - .filter(|line| !line.chars().all(char::is_whitespace)) - // The line with the closing ``` is always taken into account - .chain(lines.last()) - .map(|line| line.chars().take_while(|c| c.is_whitespace()).count()) - .min() - .unwrap_or(0); - - // Dedent based on column, but not for the first line. - for line in lines.iter_mut().skip(1) { - let offset = line.chars().take(dedent).map(char::len_utf8).sum(); - *line = &line[offset..]; - } - - let is_whitespace = |line: &&str| line.chars().all(char::is_whitespace); - - // Trims a sequence of whitespace followed by a newline at the start. - if lines.first().is_some_and(is_whitespace) { - lines.remove(0); - } - - // Trims a newline followed by a sequence of whitespace at the end. - if lines.last().is_some_and(is_whitespace) { - lines.pop(); - } - } - - lines.join("\n").into() + /// The lines in the raw block. + pub fn lines(self) -> impl DoubleEndedIterator<Item = Text<'a>> { + self.0.children().filter_map(SyntaxNode::cast) } /// An optional identifier specifying the language to syntax-highlight in. - pub fn lang(self) -> Option<&'a str> { - let text = self.0.text(); - + pub fn lang(self) -> Option<RawLang<'a>> { // Only blocky literals are supposed to contain a language. - if !text.starts_with("```") { + let delim: RawDelim = self.0.cast_first_match()?; + if delim.0.len() < 3 { return Option::None; } - let inner = text.trim_start_matches('`'); - let mut s = Scanner::new(inner); - s.eat_if(is_id_start).then(|| { - s.eat_while(is_id_continue); - s.before() - }) + self.0.cast_first_match() } /// Whether the raw text should be displayed in a separate block. pub fn block(self) -> bool { - let text = self.0.text(); - text.starts_with("```") && text.chars().any(is_newline) + self.0 + .cast_first_match() + .is_some_and(|delim: RawDelim| delim.0.len() >= 3) + && self.0.children().any(|e| { + e.kind() == SyntaxKind::RawTrimmed && e.text().chars().any(is_newline) + }) } } node! { + /// A language tag at the start of raw element: ``typ ``. + RawLang +} + +impl<'a> RawLang<'a> { + /// Get the language tag. + pub fn get(self) -> &'a EcoString { + self.0.text() + } +} + +node! { + /// A raw delimiter in single or 3+ backticks: `` ` ``. + RawDelim +} + +node! { /// A hyperlink: `https://typst.org`. Link } diff --git a/crates/typst-syntax/src/highlight.rs b/crates/typst-syntax/src/highlight.rs index 19d35d0a..f1c8a298 100644 --- a/crates/typst-syntax/src/highlight.rs +++ b/crates/typst-syntax/src/highlight.rs @@ -153,6 +153,9 @@ pub fn highlight(node: &LinkedNode) -> Option<Tag> { SyntaxKind::Strong => Some(Tag::Strong), SyntaxKind::Emph => Some(Tag::Emph), SyntaxKind::Raw => Some(Tag::Raw), + SyntaxKind::RawLang => None, + SyntaxKind::RawTrimmed => None, + SyntaxKind::RawDelim => None, SyntaxKind::Link => Some(Tag::Link), SyntaxKind::Label => Some(Tag::Label), SyntaxKind::Ref => Some(Tag::Ref), diff --git a/crates/typst-syntax/src/kind.rs b/crates/typst-syntax/src/kind.rs index e5dd4e9b..c34f6002 100644 --- a/crates/typst-syntax/src/kind.rs +++ b/crates/typst-syntax/src/kind.rs @@ -28,6 +28,12 @@ pub enum SyntaxKind { Emph, /// Raw text with optional syntax highlighting: `` `...` ``. Raw, + /// A language tag at the start of raw text: ``typ ``. + RawLang, + /// A raw delimiter consisting of 1 or 3+ backticks: `` ` ``. + RawDelim, + /// A sequence of whitespace to ignore in a raw block: ` `. + RawTrimmed, /// A hyperlink: `https://typst.org`. Link, /// A label: `<intro>`. @@ -369,6 +375,9 @@ impl SyntaxKind { Self::Strong => "strong content", Self::Emph => "emphasized content", Self::Raw => "raw block", + Self::RawLang => "raw language tag", + Self::RawTrimmed => "raw trimmed", + Self::RawDelim => "raw delimiter", Self::Link => "link", Self::Label => "label", Self::Ref => "reference", diff --git a/crates/typst-syntax/src/lexer.rs b/crates/typst-syntax/src/lexer.rs index 300a8353..aacbee62 100644 --- a/crates/typst-syntax/src/lexer.rs +++ b/crates/typst-syntax/src/lexer.rs @@ -16,6 +16,8 @@ pub(super) struct Lexer<'s> { mode: LexMode, /// Whether the last token contained a newline. newline: bool, + /// The state held by raw line lexing. + raw: Vec<(SyntaxKind, usize)>, /// An error for the last token. error: Option<EcoString>, } @@ -29,6 +31,8 @@ pub(super) enum LexMode { Math, /// Keywords, literals and operators. Code, + /// The contents of a raw block. + Raw, } impl<'s> Lexer<'s> { @@ -40,6 +44,7 @@ impl<'s> Lexer<'s> { mode, newline: false, error: None, + raw: Vec::new(), } } @@ -86,6 +91,14 @@ impl Lexer<'_> { /// Shared. impl Lexer<'_> { pub fn next(&mut self) -> SyntaxKind { + if self.mode == LexMode::Raw { + let Some((kind, end)) = self.raw.pop() else { + return SyntaxKind::Eof; + }; + self.s.jump(end); + return kind; + } + self.newline = false; self.error = None; let start = self.s.cursor(); @@ -101,6 +114,7 @@ impl Lexer<'_> { LexMode::Markup => self.markup(start, c), LexMode::Math => self.math(start, c), LexMode::Code => self.code(start, c), + LexMode::Raw => unreachable!(), }, None => SyntaxKind::Eof, @@ -224,15 +238,23 @@ impl Lexer<'_> { } fn raw(&mut self) -> SyntaxKind { + let start = self.s.cursor() - 1; + self.raw.clear(); + + // Determine number of opening backticks. let mut backticks = 1; while self.s.eat_if('`') { backticks += 1; } + // Special case for ``. if backticks == 2 { - return SyntaxKind::Raw; + self.push_raw(SyntaxKind::RawDelim); + self.s.jump(start + 1); + return SyntaxKind::RawDelim; } + // Find end of raw text. let mut found = 0; while found < backticks { match self.s.eat() { @@ -246,12 +268,99 @@ impl Lexer<'_> { return self.error("unclosed raw text"); } - SyntaxKind::Raw + let end = self.s.cursor(); + if backticks >= 3 { + self.blocky_raw(start, end, backticks); + } else { + // Single backtick needs no trimming or extra fancyness. + self.s.jump(end - backticks); + self.push_raw(SyntaxKind::Text); + self.s.jump(end); + } + + // Closing delimiter. + self.push_raw(SyntaxKind::RawDelim); + + // The saved tokens will be removed in reverse. + self.raw.reverse(); + + // Opening delimiter. + self.s.jump(start + backticks); + SyntaxKind::RawDelim + } + + fn blocky_raw(&mut self, start: usize, end: usize, backticks: usize) { + // Language tag. + self.s.jump(start + backticks); + if self.s.eat_if(is_id_start) { + self.s.eat_while(is_id_continue); + self.push_raw(SyntaxKind::RawLang); + } + + // Determine inner content between backticks and with trimmed + // single spaces (line trimming comes later). + self.s.eat_if(' '); + let mut inner = self.s.to(end - backticks); + if inner.trim_end().ends_with('`') { + inner = inner.strip_suffix(' ').unwrap_or(inner); + } + + // Determine dedent level. + let lines = split_newlines(inner); + let dedent = lines + .iter() + .skip(1) + .filter(|line| !line.chars().all(char::is_whitespace)) + // The line with the closing ``` is always taken into account + .chain(lines.last()) + .map(|line| line.chars().take_while(|c| c.is_whitespace()).count()) + .min() + .unwrap_or(0); + + let is_whitespace = |line: &&str| line.chars().all(char::is_whitespace); + let starts_whitespace = lines.first().is_some_and(is_whitespace); + let ends_whitespace = lines.last().is_some_and(is_whitespace); + + let mut lines = lines.into_iter(); + let mut skipped = false; + + // Trim whitespace + newline at start. + if starts_whitespace { + self.s.advance(lines.next().unwrap().len()); + skipped = true; + } + // Trim whitespace + newline at end. + if ends_whitespace { + lines.next_back(); + } + + // Add lines. + for (i, line) in lines.enumerate() { + let dedent = if i == 0 && !skipped { 0 } else { dedent }; + let offset: usize = line.chars().take(dedent).map(char::len_utf8).sum(); + self.s.eat_newline(); + self.s.advance(offset); + self.push_raw(SyntaxKind::RawTrimmed); + self.s.advance(line.len() - offset); + self.push_raw(SyntaxKind::Text); + } + + // Add final trimmed. + if self.s.cursor() < end - backticks { + self.s.jump(end - backticks); + self.push_raw(SyntaxKind::RawTrimmed); + } + self.s.jump(end); + } + + fn push_raw(&mut self, kind: SyntaxKind) { + let end = self.s.cursor(); + self.raw.push((kind, end)); } fn link(&mut self) -> SyntaxKind { let (link, balanced) = link_prefix(self.s.after()); - self.s.jump(self.s.cursor() + link.len()); + self.s.advance(link.len()); if !balanced { return self.error( @@ -632,6 +741,25 @@ fn keyword(ident: &str) -> Option<SyntaxKind> { }) } +trait ScannerExt { + fn advance(&mut self, by: usize); + fn eat_newline(&mut self) -> bool; +} + +impl ScannerExt for Scanner<'_> { + fn advance(&mut self, by: usize) { + self.jump(self.cursor() + by); + } + + fn eat_newline(&mut self) -> bool { + let ate = self.eat_if(is_newline); + if ate && self.before().ends_with('\r') { + self.eat_if('\n'); + } + ate + } +} + /// Whether a character will become a Space token in Typst #[inline] fn is_space(character: char, mode: LexMode) -> bool { diff --git a/crates/typst-syntax/src/parser.rs b/crates/typst-syntax/src/parser.rs index f4bb19e1..4785b8a1 100644 --- a/crates/typst-syntax/src/parser.rs +++ b/crates/typst-syntax/src/parser.rs @@ -116,13 +116,13 @@ fn markup_expr(p: &mut Parser, at_start: &mut bool) { | SyntaxKind::Escape | SyntaxKind::Shorthand | SyntaxKind::SmartQuote - | SyntaxKind::Raw | SyntaxKind::Link | SyntaxKind::Label => p.eat(), SyntaxKind::Hash => embedded_code_expr(p), SyntaxKind::Star => strong(p), SyntaxKind::Underscore => emph(p), + SyntaxKind::RawDelim => raw(p), SyntaxKind::HeadingMarker if *at_start => heading(p), SyntaxKind::ListMarker if *at_start => list_item(p), SyntaxKind::EnumMarker if *at_start => enum_item(p), @@ -172,6 +172,22 @@ fn emph(p: &mut Parser) { p.wrap(m, SyntaxKind::Emph); } +/// Parses raw text with optional syntax highlighting: `` `...` ``. +fn raw(p: &mut Parser) { + let m = p.marker(); + p.enter(LexMode::Raw); + p.assert(SyntaxKind::RawDelim); + + // Eats until the closing delimiter. + while !p.eof() && !p.at(SyntaxKind::RawDelim) { + p.eat(); + } + + p.expect(SyntaxKind::RawDelim); + p.exit(); + p.wrap(m, SyntaxKind::Raw); +} + /// Parses a section heading: `= Introduction`. fn heading(p: &mut Parser) { const END: SyntaxSet = SyntaxSet::new() @@ -747,6 +763,7 @@ fn code_primary(p: &mut Parser, atomic: bool) { SyntaxKind::LeftBrace => code_block(p), SyntaxKind::LeftBracket => content_block(p), SyntaxKind::LeftParen => expr_with_paren(p, atomic), + SyntaxKind::RawDelim => raw(p), SyntaxKind::Dollar => equation(p), SyntaxKind::Let => let_binding(p), SyntaxKind::Set => set_rule(p), @@ -768,8 +785,7 @@ fn code_primary(p: &mut Parser, atomic: bool) { | SyntaxKind::Bool | SyntaxKind::Numeric | SyntaxKind::Str - | SyntaxKind::Label - | SyntaxKind::Raw => p.eat(), + | SyntaxKind::Label => p.eat(), _ => p.expected("expression"), } diff --git a/crates/typst-syntax/src/set.rs b/crates/typst-syntax/src/set.rs index 906d5fac..39e64651 100644 --- a/crates/typst-syntax/src/set.rs +++ b/crates/typst-syntax/src/set.rs @@ -15,7 +15,10 @@ impl SyntaxSet { } /// Insert a syntax kind into the set. + /// + /// You can only add kinds with discriminator < 128. pub const fn add(self, kind: SyntaxKind) -> Self { + assert!((kind as u8) < BITS); Self(self.0 | bit(kind)) } @@ -26,10 +29,12 @@ impl SyntaxSet { /// Whether the set contains the given syntax kind. pub const fn contains(&self, kind: SyntaxKind) -> bool { - (self.0 & bit(kind)) != 0 + (kind as u8) < BITS && (self.0 & bit(kind)) != 0 } } +const BITS: u8 = 128; + const fn bit(kind: SyntaxKind) -> u128 { 1 << (kind as usize) } @@ -54,7 +59,7 @@ pub const MARKUP_EXPR: SyntaxSet = SyntaxSet::new() .add(SyntaxKind::Escape) .add(SyntaxKind::Shorthand) .add(SyntaxKind::SmartQuote) - .add(SyntaxKind::Raw) + .add(SyntaxKind::RawDelim) .add(SyntaxKind::Link) .add(SyntaxKind::Label) .add(SyntaxKind::Hash) @@ -119,7 +124,7 @@ pub const ATOMIC_CODE_PRIMARY: SyntaxSet = SyntaxSet::new() .add(SyntaxKind::Numeric) .add(SyntaxKind::Str) .add(SyntaxKind::Label) - .add(SyntaxKind::Raw); + .add(SyntaxKind::RawDelim); /// Syntax kinds that are unary operators. pub const UNARY_OP: SyntaxSet = SyntaxSet::new() @@ -172,11 +177,6 @@ mod tests { use super::*; #[test] - fn test_size() { - assert!((SyntaxKind::Eof as usize) < 128); - } - - #[test] fn test_set() { let set = SyntaxSet::new().add(SyntaxKind::And).add(SyntaxKind::Or); assert!(set.contains(SyntaxKind::And)); |
