summary refs log tree commit diff
path: root/crates/typst-syntax
diff options
context:
space:
mode:
author Myriad-Dreamin <35292584+Myriad-Dreamin@users.noreply.github.com> 2024-03-01 17:17:41 +0800
committer GitHub <noreply@github.com> 2024-03-01 09:17:41 +0000
commit 030041466b5b8453ca23e43a6385f4592f78a56c (patch)
tree 7af9f2d34c349980881a2b9908a5ad8decce1616 /crates/typst-syntax
parent 57ab6d09248ba036e7feb32f8b9527ec643f826c (diff)
Provide more fine-grained spans in raw blocks (#3257)
Co-authored-by: Laurenz <laurmaedje@gmail.com>
Diffstat (limited to 'crates/typst-syntax')
-rw-r--r-- crates/typst-syntax/src/ast.rs 100
-rw-r--r-- crates/typst-syntax/src/highlight.rs 3
-rw-r--r-- crates/typst-syntax/src/kind.rs 9
-rw-r--r-- crates/typst-syntax/src/lexer.rs 134
-rw-r--r-- crates/typst-syntax/src/parser.rs 22
-rw-r--r-- crates/typst-syntax/src/set.rs 16
6 files changed, 201 insertions, 83 deletions
diff --git a/crates/typst-syntax/src/ast.rs b/crates/typst-syntax/src/ast.rs
index 8f8eaac4..fc689a68 100644
--- a/crates/typst-syntax/src/ast.rs
+++ b/crates/typst-syntax/src/ast.rs
@@ -8,9 +8,7 @@ use std::ops::Deref;
use ecow::EcoString;
use unscanny::Scanner;
-use crate::{
- is_id_continue, is_id_start, is_newline, split_newlines, Span, SyntaxKind, SyntaxNode,
-};
+use crate::{is_newline, Span, SyntaxKind, SyntaxNode};
/// A typed AST node.
pub trait AstNode<'a>: Sized {
@@ -558,87 +556,51 @@ node! {
}
impl<'a> Raw<'a> {
- /// The trimmed raw text.
- pub fn text(self) -> EcoString {
- let mut text = self.0.text().as_str();
- let blocky = text.starts_with("```");
- text = text.trim_matches('`');
-
- // Trim tag, one space at the start, and one space at the end if the
- // last non-whitespace char is a backtick.
- if blocky {
- let mut s = Scanner::new(text);
- if s.eat_if(is_id_start) {
- s.eat_while(is_id_continue);
- }
- text = s.after();
- text = text.strip_prefix(' ').unwrap_or(text);
- if text.trim_end().ends_with('`') {
- text = text.strip_suffix(' ').unwrap_or(text);
- }
- }
-
- // Split into lines.
- let mut lines = split_newlines(text);
-
- if blocky {
- let dedent = lines
- .iter()
- .skip(1)
- .filter(|line| !line.chars().all(char::is_whitespace))
- // The line with the closing ``` is always taken into account
- .chain(lines.last())
- .map(|line| line.chars().take_while(|c| c.is_whitespace()).count())
- .min()
- .unwrap_or(0);
-
- // Dedent based on column, but not for the first line.
- for line in lines.iter_mut().skip(1) {
- let offset = line.chars().take(dedent).map(char::len_utf8).sum();
- *line = &line[offset..];
- }
-
- let is_whitespace = |line: &&str| line.chars().all(char::is_whitespace);
-
- // Trims a sequence of whitespace followed by a newline at the start.
- if lines.first().is_some_and(is_whitespace) {
- lines.remove(0);
- }
-
- // Trims a newline followed by a sequence of whitespace at the end.
- if lines.last().is_some_and(is_whitespace) {
- lines.pop();
- }
- }
-
- lines.join("\n").into()
+ /// The lines in the raw block.
+ pub fn lines(self) -> impl DoubleEndedIterator<Item = Text<'a>> {
+ self.0.children().filter_map(SyntaxNode::cast)
}
/// An optional identifier specifying the language to syntax-highlight in.
- pub fn lang(self) -> Option<&'a str> {
- let text = self.0.text();
-
+ pub fn lang(self) -> Option<RawLang<'a>> {
// Only blocky literals are supposed to contain a language.
- if !text.starts_with("```") {
+ let delim: RawDelim = self.0.cast_first_match()?;
+ if delim.0.len() < 3 {
return Option::None;
}
- let inner = text.trim_start_matches('`');
- let mut s = Scanner::new(inner);
- s.eat_if(is_id_start).then(|| {
- s.eat_while(is_id_continue);
- s.before()
- })
+ self.0.cast_first_match()
}
/// Whether the raw text should be displayed in a separate block.
pub fn block(self) -> bool {
- let text = self.0.text();
- text.starts_with("```") && text.chars().any(is_newline)
+ self.0
+ .cast_first_match()
+ .is_some_and(|delim: RawDelim| delim.0.len() >= 3)
+ && self.0.children().any(|e| {
+ e.kind() == SyntaxKind::RawTrimmed && e.text().chars().any(is_newline)
+ })
}
}
node! {
+ /// A language tag at the start of a raw element: ``typ ``.
+ RawLang
+}
+
+impl<'a> RawLang<'a> {
+ /// Get the language tag.
+ pub fn get(self) -> &'a EcoString {
+ self.0.text()
+ }
+}
+
+node! {
+ /// A raw delimiter in single or 3+ backticks: `` ` ``.
+ RawDelim
+}
+
+node! {
/// A hyperlink: `https://typst.org`.
Link
}
diff --git a/crates/typst-syntax/src/highlight.rs b/crates/typst-syntax/src/highlight.rs
index 19d35d0a..f1c8a298 100644
--- a/crates/typst-syntax/src/highlight.rs
+++ b/crates/typst-syntax/src/highlight.rs
@@ -153,6 +153,9 @@ pub fn highlight(node: &LinkedNode) -> Option<Tag> {
SyntaxKind::Strong => Some(Tag::Strong),
SyntaxKind::Emph => Some(Tag::Emph),
SyntaxKind::Raw => Some(Tag::Raw),
+ SyntaxKind::RawLang => None,
+ SyntaxKind::RawTrimmed => None,
+ SyntaxKind::RawDelim => None,
SyntaxKind::Link => Some(Tag::Link),
SyntaxKind::Label => Some(Tag::Label),
SyntaxKind::Ref => Some(Tag::Ref),
diff --git a/crates/typst-syntax/src/kind.rs b/crates/typst-syntax/src/kind.rs
index e5dd4e9b..c34f6002 100644
--- a/crates/typst-syntax/src/kind.rs
+++ b/crates/typst-syntax/src/kind.rs
@@ -28,6 +28,12 @@ pub enum SyntaxKind {
Emph,
/// Raw text with optional syntax highlighting: `` `...` ``.
Raw,
+ /// A language tag at the start of raw text: ``typ ``.
+ RawLang,
+ /// A raw delimiter consisting of 1 or 3+ backticks: `` ` ``.
+ RawDelim,
+ /// A sequence of whitespace to ignore in a raw block: ` `.
+ RawTrimmed,
/// A hyperlink: `https://typst.org`.
Link,
/// A label: `<intro>`.
@@ -369,6 +375,9 @@ impl SyntaxKind {
Self::Strong => "strong content",
Self::Emph => "emphasized content",
Self::Raw => "raw block",
+ Self::RawLang => "raw language tag",
+ Self::RawTrimmed => "raw trimmed",
+ Self::RawDelim => "raw delimiter",
Self::Link => "link",
Self::Label => "label",
Self::Ref => "reference",
diff --git a/crates/typst-syntax/src/lexer.rs b/crates/typst-syntax/src/lexer.rs
index 300a8353..aacbee62 100644
--- a/crates/typst-syntax/src/lexer.rs
+++ b/crates/typst-syntax/src/lexer.rs
@@ -16,6 +16,8 @@ pub(super) struct Lexer<'s> {
mode: LexMode,
/// Whether the last token contained a newline.
newline: bool,
+ /// The state held by raw line lexing.
+ raw: Vec<(SyntaxKind, usize)>,
/// An error for the last token.
error: Option<EcoString>,
}
@@ -29,6 +31,8 @@ pub(super) enum LexMode {
Math,
/// Keywords, literals and operators.
Code,
+ /// The contents of a raw block.
+ Raw,
}
impl<'s> Lexer<'s> {
@@ -40,6 +44,7 @@ impl<'s> Lexer<'s> {
mode,
newline: false,
error: None,
+ raw: Vec::new(),
}
}
@@ -86,6 +91,14 @@ impl Lexer<'_> {
/// Shared.
impl Lexer<'_> {
pub fn next(&mut self) -> SyntaxKind {
+ if self.mode == LexMode::Raw {
+ let Some((kind, end)) = self.raw.pop() else {
+ return SyntaxKind::Eof;
+ };
+ self.s.jump(end);
+ return kind;
+ }
+
self.newline = false;
self.error = None;
let start = self.s.cursor();
@@ -101,6 +114,7 @@ impl Lexer<'_> {
LexMode::Markup => self.markup(start, c),
LexMode::Math => self.math(start, c),
LexMode::Code => self.code(start, c),
+ LexMode::Raw => unreachable!(),
},
None => SyntaxKind::Eof,
@@ -224,15 +238,23 @@ impl Lexer<'_> {
}
fn raw(&mut self) -> SyntaxKind {
+ let start = self.s.cursor() - 1;
+ self.raw.clear();
+
+ // Determine number of opening backticks.
let mut backticks = 1;
while self.s.eat_if('`') {
backticks += 1;
}
+ // Special case for ``.
if backticks == 2 {
- return SyntaxKind::Raw;
+ self.push_raw(SyntaxKind::RawDelim);
+ self.s.jump(start + 1);
+ return SyntaxKind::RawDelim;
}
+ // Find end of raw text.
let mut found = 0;
while found < backticks {
match self.s.eat() {
@@ -246,12 +268,99 @@ impl Lexer<'_> {
return self.error("unclosed raw text");
}
- SyntaxKind::Raw
+ let end = self.s.cursor();
+ if backticks >= 3 {
+ self.blocky_raw(start, end, backticks);
+ } else {
+ // Single backtick needs no trimming or extra fanciness.
+ self.s.jump(end - backticks);
+ self.push_raw(SyntaxKind::Text);
+ self.s.jump(end);
+ }
+
+ // Closing delimiter.
+ self.push_raw(SyntaxKind::RawDelim);
+
+ // The saved tokens will be removed in reverse.
+ self.raw.reverse();
+
+ // Opening delimiter.
+ self.s.jump(start + backticks);
+ SyntaxKind::RawDelim
+ }
+
+ fn blocky_raw(&mut self, start: usize, end: usize, backticks: usize) {
+ // Language tag.
+ self.s.jump(start + backticks);
+ if self.s.eat_if(is_id_start) {
+ self.s.eat_while(is_id_continue);
+ self.push_raw(SyntaxKind::RawLang);
+ }
+
+ // Determine inner content between backticks and with trimmed
+ // single spaces (line trimming comes later).
+ self.s.eat_if(' ');
+ let mut inner = self.s.to(end - backticks);
+ if inner.trim_end().ends_with('`') {
+ inner = inner.strip_suffix(' ').unwrap_or(inner);
+ }
+
+ // Determine dedent level.
+ let lines = split_newlines(inner);
+ let dedent = lines
+ .iter()
+ .skip(1)
+ .filter(|line| !line.chars().all(char::is_whitespace))
+ // The line with the closing ``` is always taken into account
+ .chain(lines.last())
+ .map(|line| line.chars().take_while(|c| c.is_whitespace()).count())
+ .min()
+ .unwrap_or(0);
+
+ let is_whitespace = |line: &&str| line.chars().all(char::is_whitespace);
+ let starts_whitespace = lines.first().is_some_and(is_whitespace);
+ let ends_whitespace = lines.last().is_some_and(is_whitespace);
+
+ let mut lines = lines.into_iter();
+ let mut skipped = false;
+
+ // Trim whitespace + newline at start.
+ if starts_whitespace {
+ self.s.advance(lines.next().unwrap().len());
+ skipped = true;
+ }
+ // Trim whitespace + newline at end.
+ if ends_whitespace {
+ lines.next_back();
+ }
+
+ // Add lines.
+ for (i, line) in lines.enumerate() {
+ let dedent = if i == 0 && !skipped { 0 } else { dedent };
+ let offset: usize = line.chars().take(dedent).map(char::len_utf8).sum();
+ self.s.eat_newline();
+ self.s.advance(offset);
+ self.push_raw(SyntaxKind::RawTrimmed);
+ self.s.advance(line.len() - offset);
+ self.push_raw(SyntaxKind::Text);
+ }
+
+ // Add final trimmed.
+ if self.s.cursor() < end - backticks {
+ self.s.jump(end - backticks);
+ self.push_raw(SyntaxKind::RawTrimmed);
+ }
+ self.s.jump(end);
+ }
+
+ fn push_raw(&mut self, kind: SyntaxKind) {
+ let end = self.s.cursor();
+ self.raw.push((kind, end));
}
fn link(&mut self) -> SyntaxKind {
let (link, balanced) = link_prefix(self.s.after());
- self.s.jump(self.s.cursor() + link.len());
+ self.s.advance(link.len());
if !balanced {
return self.error(
@@ -632,6 +741,25 @@ fn keyword(ident: &str) -> Option<SyntaxKind> {
})
}
+trait ScannerExt {
+ fn advance(&mut self, by: usize);
+ fn eat_newline(&mut self) -> bool;
+}
+
+impl ScannerExt for Scanner<'_> {
+ fn advance(&mut self, by: usize) {
+ self.jump(self.cursor() + by);
+ }
+
+ fn eat_newline(&mut self) -> bool {
+ let ate = self.eat_if(is_newline);
+ if ate && self.before().ends_with('\r') {
+ self.eat_if('\n');
+ }
+ ate
+ }
+}
+
/// Whether a character will become a Space token in Typst
#[inline]
fn is_space(character: char, mode: LexMode) -> bool {
diff --git a/crates/typst-syntax/src/parser.rs b/crates/typst-syntax/src/parser.rs
index f4bb19e1..4785b8a1 100644
--- a/crates/typst-syntax/src/parser.rs
+++ b/crates/typst-syntax/src/parser.rs
@@ -116,13 +116,13 @@ fn markup_expr(p: &mut Parser, at_start: &mut bool) {
| SyntaxKind::Escape
| SyntaxKind::Shorthand
| SyntaxKind::SmartQuote
- | SyntaxKind::Raw
| SyntaxKind::Link
| SyntaxKind::Label => p.eat(),
SyntaxKind::Hash => embedded_code_expr(p),
SyntaxKind::Star => strong(p),
SyntaxKind::Underscore => emph(p),
+ SyntaxKind::RawDelim => raw(p),
SyntaxKind::HeadingMarker if *at_start => heading(p),
SyntaxKind::ListMarker if *at_start => list_item(p),
SyntaxKind::EnumMarker if *at_start => enum_item(p),
@@ -172,6 +172,22 @@ fn emph(p: &mut Parser) {
p.wrap(m, SyntaxKind::Emph);
}
+/// Parses raw text with optional syntax highlighting: `` `...` ``.
+fn raw(p: &mut Parser) {
+ let m = p.marker();
+ p.enter(LexMode::Raw);
+ p.assert(SyntaxKind::RawDelim);
+
+ // Eats until the closing delimiter.
+ while !p.eof() && !p.at(SyntaxKind::RawDelim) {
+ p.eat();
+ }
+
+ p.expect(SyntaxKind::RawDelim);
+ p.exit();
+ p.wrap(m, SyntaxKind::Raw);
+}
+
/// Parses a section heading: `= Introduction`.
fn heading(p: &mut Parser) {
const END: SyntaxSet = SyntaxSet::new()
@@ -747,6 +763,7 @@ fn code_primary(p: &mut Parser, atomic: bool) {
SyntaxKind::LeftBrace => code_block(p),
SyntaxKind::LeftBracket => content_block(p),
SyntaxKind::LeftParen => expr_with_paren(p, atomic),
+ SyntaxKind::RawDelim => raw(p),
SyntaxKind::Dollar => equation(p),
SyntaxKind::Let => let_binding(p),
SyntaxKind::Set => set_rule(p),
@@ -768,8 +785,7 @@ fn code_primary(p: &mut Parser, atomic: bool) {
| SyntaxKind::Bool
| SyntaxKind::Numeric
| SyntaxKind::Str
- | SyntaxKind::Label
- | SyntaxKind::Raw => p.eat(),
+ | SyntaxKind::Label => p.eat(),
_ => p.expected("expression"),
}
diff --git a/crates/typst-syntax/src/set.rs b/crates/typst-syntax/src/set.rs
index 906d5fac..39e64651 100644
--- a/crates/typst-syntax/src/set.rs
+++ b/crates/typst-syntax/src/set.rs
@@ -15,7 +15,10 @@ impl SyntaxSet {
}
/// Insert a syntax kind into the set.
+ ///
+ /// You can only add kinds with discriminant < 128.
pub const fn add(self, kind: SyntaxKind) -> Self {
+ assert!((kind as u8) < BITS);
Self(self.0 | bit(kind))
}
@@ -26,10 +29,12 @@ impl SyntaxSet {
/// Whether the set contains the given syntax kind.
pub const fn contains(&self, kind: SyntaxKind) -> bool {
- (self.0 & bit(kind)) != 0
+ (kind as u8) < BITS && (self.0 & bit(kind)) != 0
}
}
+const BITS: u8 = 128;
+
const fn bit(kind: SyntaxKind) -> u128 {
1 << (kind as usize)
}
@@ -54,7 +59,7 @@ pub const MARKUP_EXPR: SyntaxSet = SyntaxSet::new()
.add(SyntaxKind::Escape)
.add(SyntaxKind::Shorthand)
.add(SyntaxKind::SmartQuote)
- .add(SyntaxKind::Raw)
+ .add(SyntaxKind::RawDelim)
.add(SyntaxKind::Link)
.add(SyntaxKind::Label)
.add(SyntaxKind::Hash)
@@ -119,7 +124,7 @@ pub const ATOMIC_CODE_PRIMARY: SyntaxSet = SyntaxSet::new()
.add(SyntaxKind::Numeric)
.add(SyntaxKind::Str)
.add(SyntaxKind::Label)
- .add(SyntaxKind::Raw);
+ .add(SyntaxKind::RawDelim);
/// Syntax kinds that are unary operators.
pub const UNARY_OP: SyntaxSet = SyntaxSet::new()
@@ -172,11 +177,6 @@ mod tests {
use super::*;
#[test]
- fn test_size() {
- assert!((SyntaxKind::Eof as usize) < 128);
- }
-
- #[test]
fn test_set() {
let set = SyntaxSet::new().add(SyntaxKind::And).add(SyntaxKind::Or);
assert!(set.contains(SyntaxKind::And));