Provide more fine-grained spans in raw blocks (#3257)

Co-authored-by: Laurenz <laurmaedje@gmail.com>
author: Myriad-Dreamin <35292584+Myriad-Dreamin@users.noreply.github.com> 2024-03-01 17:17:41 +0800
committer: GitHub <noreply@github.com> 2024-03-01 09:17:41 +0000
commit: 030041466b5b8453ca23e43a6385f4592f78a56c (patch)
tree: 7af9f2d34c349980881a2b9908a5ad8decce1616 /crates/typst-syntax
parent: 57ab6d09248ba036e7feb32f8b9527ec643f826c (diff)
6 files changed, 201 insertions, 83 deletions
diff --git a/crates/typst-syntax/src/ast.rs b/crates/typst-syntax/src/ast.rs
index 8f8eaac4..fc689a68 100644
--- a/crates/typst-syntax/src/ast.rs
+++ b/crates/typst-syntax/src/ast.rs
@@ -8,9 +8,7 @@ use std::ops::Deref;
 use ecow::EcoString;
 use unscanny::Scanner;
 
-use crate::{
-    is_id_continue, is_id_start, is_newline, split_newlines, Span, SyntaxKind, SyntaxNode,
-};
+use crate::{is_newline, Span, SyntaxKind, SyntaxNode};
 
 /// A typed AST node.
 pub trait AstNode<'a>: Sized {
@@ -558,87 +556,51 @@ node! {
 }
 
 impl<'a> Raw<'a> {
-    /// The trimmed raw text.
-    pub fn text(self) -> EcoString {
-        let mut text = self.0.text().as_str();
-        let blocky = text.starts_with("```");
-        text = text.trim_matches('`');
-
-        // Trim tag, one space at the start, and one space at the end if the
-        // last non-whitespace char is a backtick.
-        if blocky {
-            let mut s = Scanner::new(text);
-            if s.eat_if(is_id_start) {
-                s.eat_while(is_id_continue);
-            }
-            text = s.after();
-            text = text.strip_prefix(' ').unwrap_or(text);
-            if text.trim_end().ends_with('`') {
-                text = text.strip_suffix(' ').unwrap_or(text);
-            }
-        }
-
-        // Split into lines.
-        let mut lines = split_newlines(text);
-
-        if blocky {
-            let dedent = lines
-                .iter()
-                .skip(1)
-                .filter(|line| !line.chars().all(char::is_whitespace))
-                // The line with the closing ``` is always taken into account
-                .chain(lines.last())
-                .map(|line| line.chars().take_while(|c| c.is_whitespace()).count())
-                .min()
-                .unwrap_or(0);
-
-            // Dedent based on column, but not for the first line.
-            for line in lines.iter_mut().skip(1) {
-                let offset = line.chars().take(dedent).map(char::len_utf8).sum();
-                *line = &line[offset..];
-            }
-
-            let is_whitespace = |line: &&str| line.chars().all(char::is_whitespace);
-
-            // Trims a sequence of whitespace followed by a newline at the start.
-            if lines.first().is_some_and(is_whitespace) {
-                lines.remove(0);
-            }
-
-            // Trims a newline followed by a sequence of whitespace at the end.
-            if lines.last().is_some_and(is_whitespace) {
-                lines.pop();
-            }
-        }
-
-        lines.join("\n").into()
+    /// The lines in the raw block.
+    pub fn lines(self) -> impl DoubleEndedIterator<Item = Text<'a>> {
+        self.0.children().filter_map(SyntaxNode::cast)
     }
 
     /// An optional identifier specifying the language to syntax-highlight in.
-    pub fn lang(self) -> Option<&'a str> {
-        let text = self.0.text();
-
+    pub fn lang(self) -> Option<RawLang<'a>> {
         // Only blocky literals are supposed to contain a language.
-        if !text.starts_with("```") {
+        let delim: RawDelim = self.0.cast_first_match()?;
+        if delim.0.len() < 3 {
             return Option::None;
         }
 
-        let inner = text.trim_start_matches('`');
-        let mut s = Scanner::new(inner);
-        s.eat_if(is_id_start).then(|| {
-            s.eat_while(is_id_continue);
-            s.before()
-        })
+        self.0.cast_first_match()
     }
 
     /// Whether the raw text should be displayed in a separate block.
     pub fn block(self) -> bool {
-        let text = self.0.text();
-        text.starts_with("```") && text.chars().any(is_newline)
+        self.0
+            .cast_first_match()
+            .is_some_and(|delim: RawDelim| delim.0.len() >= 3)
+            && self.0.children().any(|e| {
+                e.kind() == SyntaxKind::RawTrimmed && e.text().chars().any(is_newline)
+            })
     }
 }
 
 node! {
+    /// A language tag at the start of a raw element: ``typ ``.
+    RawLang
+}
+
+impl<'a> RawLang<'a> {
+    /// Get the language tag.
+    pub fn get(self) -> &'a EcoString {
+        self.0.text()
+    }
+}
+
+node! {
+    /// A raw delimiter in single or 3+ backticks: `` ` ``.
+    RawDelim
+}
+
+node! {
     /// A hyperlink: `https://typst.org`.
     Link
 }
diff --git a/crates/typst-syntax/src/highlight.rs b/crates/typst-syntax/src/highlight.rs
index 19d35d0a..f1c8a298 100644
--- a/crates/typst-syntax/src/highlight.rs
+++ b/crates/typst-syntax/src/highlight.rs
@@ -153,6 +153,9 @@ pub fn highlight(node: &LinkedNode) -> Option<Tag> {
         SyntaxKind::Strong => Some(Tag::Strong),
         SyntaxKind::Emph => Some(Tag::Emph),
         SyntaxKind::Raw => Some(Tag::Raw),
+        SyntaxKind::RawLang => None,
+        SyntaxKind::RawTrimmed => None,
+        SyntaxKind::RawDelim => None,
         SyntaxKind::Link => Some(Tag::Link),
         SyntaxKind::Label => Some(Tag::Label),
         SyntaxKind::Ref => Some(Tag::Ref),
diff --git a/crates/typst-syntax/src/kind.rs b/crates/typst-syntax/src/kind.rs
index e5dd4e9b..c34f6002 100644
--- a/crates/typst-syntax/src/kind.rs
+++ b/crates/typst-syntax/src/kind.rs
@@ -28,6 +28,12 @@ pub enum SyntaxKind {
     Emph,
     /// Raw text with optional syntax highlighting: `` `...` ``.
     Raw,
+    /// A language tag at the start of raw text: ``typ ``.
+    RawLang,
+    /// A raw delimiter consisting of 1 or 3+ backticks: `` ` ``.
+    RawDelim,
+    /// A sequence of whitespace to ignore in a raw block: `    `.
+    RawTrimmed,
     /// A hyperlink: `https://typst.org`.
     Link,
     /// A label: `<intro>`.
@@ -369,6 +375,9 @@ impl SyntaxKind {
             Self::Strong => "strong content",
             Self::Emph => "emphasized content",
             Self::Raw => "raw block",
+            Self::RawLang => "raw language tag",
+            Self::RawTrimmed => "raw trimmed",
+            Self::RawDelim => "raw delimiter",
             Self::Link => "link",
             Self::Label => "label",
             Self::Ref => "reference",
diff --git a/crates/typst-syntax/src/lexer.rs b/crates/typst-syntax/src/lexer.rs
index 300a8353..aacbee62 100644
--- a/crates/typst-syntax/src/lexer.rs
+++ b/crates/typst-syntax/src/lexer.rs
@@ -16,6 +16,8 @@ pub(super) struct Lexer<'s> {
     mode: LexMode,
     /// Whether the last token contained a newline.
     newline: bool,
+    /// The state held by raw line lexing.
+    raw: Vec<(SyntaxKind, usize)>,
     /// An error for the last token.
     error: Option<EcoString>,
 }
@@ -29,6 +31,8 @@ pub(super) enum LexMode {
     Math,
     /// Keywords, literals and operators.
     Code,
+    /// The contents of a raw block.
+    Raw,
 }
 
 impl<'s> Lexer<'s> {
@@ -40,6 +44,7 @@ impl<'s> Lexer<'s> {
             mode,
             newline: false,
             error: None,
+            raw: Vec::new(),
         }
     }
 
@@ -86,6 +91,14 @@ impl Lexer<'_> {
 /// Shared.
 impl Lexer<'_> {
     pub fn next(&mut self) -> SyntaxKind {
+        if self.mode == LexMode::Raw {
+            let Some((kind, end)) = self.raw.pop() else {
+                return SyntaxKind::Eof;
+            };
+            self.s.jump(end);
+            return kind;
+        }
+
         self.newline = false;
         self.error = None;
         let start = self.s.cursor();
@@ -101,6 +114,7 @@ impl Lexer<'_> {
                 LexMode::Markup => self.markup(start, c),
                 LexMode::Math => self.math(start, c),
                 LexMode::Code => self.code(start, c),
+                LexMode::Raw => unreachable!(),
             },
 
             None => SyntaxKind::Eof,
@@ -224,15 +238,23 @@ impl Lexer<'_> {
     }
 
     fn raw(&mut self) -> SyntaxKind {
+        let start = self.s.cursor() - 1;
+        self.raw.clear();
+
+        // Determine number of opening backticks.
         let mut backticks = 1;
         while self.s.eat_if('`') {
             backticks += 1;
         }
 
+        // Special case for ``.
         if backticks == 2 {
-            return SyntaxKind::Raw;
+            self.push_raw(SyntaxKind::RawDelim);
+            self.s.jump(start + 1);
+            return SyntaxKind::RawDelim;
         }
 
+        // Find end of raw text.
         let mut found = 0;
         while found < backticks {
             match self.s.eat() {
@@ -246,12 +268,99 @@ impl Lexer<'_> {
             return self.error("unclosed raw text");
         }
 
-        SyntaxKind::Raw
+        let end = self.s.cursor();
+        if backticks >= 3 {
+            self.blocky_raw(start, end, backticks);
+        } else {
+            // Single backtick needs no trimming or extra fanciness.
+            self.s.jump(end - backticks);
+            self.push_raw(SyntaxKind::Text);
+            self.s.jump(end);
+        }
+
+        // Closing delimiter.
+        self.push_raw(SyntaxKind::RawDelim);
+
+        // Tokens are popped off the back later, so store them in reverse.
+        self.raw.reverse();
+
+        // Opening delimiter.
+        self.s.jump(start + backticks);
+        SyntaxKind::RawDelim
+    }
+
+    fn blocky_raw(&mut self, start: usize, end: usize, backticks: usize) {
+        // Language tag.
+        self.s.jump(start + backticks);
+        if self.s.eat_if(is_id_start) {
+            self.s.eat_while(is_id_continue);
+            self.push_raw(SyntaxKind::RawLang);
+        }
+
+        // Determine the inner content between the backticks, with single
+        // spaces trimmed (line trimming comes later).
+        self.s.eat_if(' ');
+        let mut inner = self.s.to(end - backticks);
+        if inner.trim_end().ends_with('`') {
+            inner = inner.strip_suffix(' ').unwrap_or(inner);
+        }
+
+        // Determine dedent level.
+        let lines = split_newlines(inner);
+        let dedent = lines
+            .iter()
+            .skip(1)
+            .filter(|line| !line.chars().all(char::is_whitespace))
+            // The line with the closing ``` is always taken into account
+            .chain(lines.last())
+            .map(|line| line.chars().take_while(|c| c.is_whitespace()).count())
+            .min()
+            .unwrap_or(0);
+
+        let is_whitespace = |line: &&str| line.chars().all(char::is_whitespace);
+        let starts_whitespace = lines.first().is_some_and(is_whitespace);
+        let ends_whitespace = lines.last().is_some_and(is_whitespace);
+
+        let mut lines = lines.into_iter();
+        let mut skipped = false;
+
+        // Trim whitespace + newline at start.
+        if starts_whitespace {
+            self.s.advance(lines.next().unwrap().len());
+            skipped = true;
+        }
+        // Trim whitespace + newline at end.
+        if ends_whitespace {
+            lines.next_back();
+        }
+
+        // Add lines.
+        for (i, line) in lines.enumerate() {
+            let dedent = if i == 0 && !skipped { 0 } else { dedent };
+            let offset: usize = line.chars().take(dedent).map(char::len_utf8).sum();
+            self.s.eat_newline();
+            self.s.advance(offset);
+            self.push_raw(SyntaxKind::RawTrimmed);
+            self.s.advance(line.len() - offset);
+            self.push_raw(SyntaxKind::Text);
+        }
+
+        // Add final trimmed.
+        if self.s.cursor() < end - backticks {
+            self.s.jump(end - backticks);
+            self.push_raw(SyntaxKind::RawTrimmed);
+        }
+        self.s.jump(end);
+    }
+
+    fn push_raw(&mut self, kind: SyntaxKind) {
+        let end = self.s.cursor();
+        self.raw.push((kind, end));
     }
 
     fn link(&mut self) -> SyntaxKind {
         let (link, balanced) = link_prefix(self.s.after());
-        self.s.jump(self.s.cursor() + link.len());
+        self.s.advance(link.len());
 
         if !balanced {
             return self.error(
@@ -632,6 +741,25 @@ fn keyword(ident: &str) -> Option<SyntaxKind> {
     })
 }
 
+trait ScannerExt {
+    fn advance(&mut self, by: usize);
+    fn eat_newline(&mut self) -> bool;
+}
+
+impl ScannerExt for Scanner<'_> {
+    fn advance(&mut self, by: usize) {
+        self.jump(self.cursor() + by);
+    }
+
+    fn eat_newline(&mut self) -> bool {
+        let ate = self.eat_if(is_newline);
+        if ate && self.before().ends_with('\r') {
+            self.eat_if('\n');
+        }
+        ate
+    }
+}
+
 /// Whether a character will become a Space token in Typst
 #[inline]
 fn is_space(character: char, mode: LexMode) -> bool {
diff --git a/crates/typst-syntax/src/parser.rs b/crates/typst-syntax/src/parser.rs
index f4bb19e1..4785b8a1 100644
--- a/crates/typst-syntax/src/parser.rs
+++ b/crates/typst-syntax/src/parser.rs
@@ -116,13 +116,13 @@ fn markup_expr(p: &mut Parser, at_start: &mut bool) {
         | SyntaxKind::Escape
         | SyntaxKind::Shorthand
         | SyntaxKind::SmartQuote
-        | SyntaxKind::Raw
         | SyntaxKind::Link
         | SyntaxKind::Label => p.eat(),
 
         SyntaxKind::Hash => embedded_code_expr(p),
         SyntaxKind::Star => strong(p),
         SyntaxKind::Underscore => emph(p),
+        SyntaxKind::RawDelim => raw(p),
         SyntaxKind::HeadingMarker if *at_start => heading(p),
         SyntaxKind::ListMarker if *at_start => list_item(p),
         SyntaxKind::EnumMarker if *at_start => enum_item(p),
@@ -172,6 +172,22 @@ fn emph(p: &mut Parser) {
     p.wrap(m, SyntaxKind::Emph);
 }
 
+/// Parses raw text with optional syntax highlighting: `` `...` ``.
+fn raw(p: &mut Parser) {
+    let m = p.marker();
+    p.enter(LexMode::Raw);
+    p.assert(SyntaxKind::RawDelim);
+
+    // Eats until the closing delimiter.
+    while !p.eof() && !p.at(SyntaxKind::RawDelim) {
+        p.eat();
+    }
+
+    p.expect(SyntaxKind::RawDelim);
+    p.exit();
+    p.wrap(m, SyntaxKind::Raw);
+}
+
 /// Parses a section heading: `= Introduction`.
 fn heading(p: &mut Parser) {
     const END: SyntaxSet = SyntaxSet::new()
@@ -747,6 +763,7 @@ fn code_primary(p: &mut Parser, atomic: bool) {
         SyntaxKind::LeftBrace => code_block(p),
         SyntaxKind::LeftBracket => content_block(p),
         SyntaxKind::LeftParen => expr_with_paren(p, atomic),
+        SyntaxKind::RawDelim => raw(p),
         SyntaxKind::Dollar => equation(p),
         SyntaxKind::Let => let_binding(p),
         SyntaxKind::Set => set_rule(p),
@@ -768,8 +785,7 @@ fn code_primary(p: &mut Parser, atomic: bool) {
         | SyntaxKind::Bool
         | SyntaxKind::Numeric
         | SyntaxKind::Str
-        | SyntaxKind::Label
-        | SyntaxKind::Raw => p.eat(),
+        | SyntaxKind::Label => p.eat(),
 
         _ => p.expected("expression"),
     }
diff --git a/crates/typst-syntax/src/set.rs b/crates/typst-syntax/src/set.rs
index 906d5fac..39e64651 100644
--- a/crates/typst-syntax/src/set.rs
+++ b/crates/typst-syntax/src/set.rs
@@ -15,7 +15,10 @@ impl SyntaxSet {
     }
 
     /// Insert a syntax kind into the set.
+    ///
+    /// You can only add kinds with discriminant < 128.
     pub const fn add(self, kind: SyntaxKind) -> Self {
+        assert!((kind as u8) < BITS);
         Self(self.0 | bit(kind))
     }
 
@@ -26,10 +29,12 @@ impl SyntaxSet {
 
     /// Whether the set contains the given syntax kind.
     pub const fn contains(&self, kind: SyntaxKind) -> bool {
-        (self.0 & bit(kind)) != 0
+        (kind as u8) < BITS && (self.0 & bit(kind)) != 0
     }
 }
 
+const BITS: u8 = 128;
+
 const fn bit(kind: SyntaxKind) -> u128 {
     1 << (kind as usize)
 }
@@ -54,7 +59,7 @@ pub const MARKUP_EXPR: SyntaxSet = SyntaxSet::new()
     .add(SyntaxKind::Escape)
     .add(SyntaxKind::Shorthand)
     .add(SyntaxKind::SmartQuote)
-    .add(SyntaxKind::Raw)
+    .add(SyntaxKind::RawDelim)
     .add(SyntaxKind::Link)
     .add(SyntaxKind::Label)
     .add(SyntaxKind::Hash)
@@ -119,7 +124,7 @@ pub const ATOMIC_CODE_PRIMARY: SyntaxSet = SyntaxSet::new()
     .add(SyntaxKind::Numeric)
     .add(SyntaxKind::Str)
     .add(SyntaxKind::Label)
-    .add(SyntaxKind::Raw);
+    .add(SyntaxKind::RawDelim);
 
 /// Syntax kinds that are unary operators.
 pub const UNARY_OP: SyntaxSet = SyntaxSet::new()
@@ -172,11 +177,6 @@ mod tests {
     use super::*;
 
     #[test]
-    fn test_size() {
-        assert!((SyntaxKind::Eof as usize) < 128);
-    }
-
-    #[test]
     fn test_set() {
         let set = SyntaxSet::new().add(SyntaxKind::And).add(SyntaxKind::Or);
         assert!(set.contains(SyntaxKind::And));
author	Myriad-Dreamin <35292584+Myriad-Dreamin@users.noreply.github.com>	2024-03-01 17:17:41 +0800
committer	GitHub <noreply@github.com>	2024-03-01 09:17:41 +0000
commit	030041466b5b8453ca23e43a6385f4592f78a56c (patch)
tree	7af9f2d34c349980881a2b9908a5ad8decce1616 /crates/typst-syntax
parent	57ab6d09248ba036e7feb32f8b9527ec643f826c (diff)