Provide more fine-grained spans in raw blocks (#3257)

Co-authored-by: Laurenz <laurmaedje@gmail.com>
author: Myriad-Dreamin <35292584+Myriad-Dreamin@users.noreply.github.com> 2024-03-01 17:17:41 +0800
committer: GitHub <noreply@github.com> 2024-03-01 09:17:41 +0000
commit: 030041466b5b8453ca23e43a6385f4592f78a56c (patch)
tree: 7af9f2d34c349980881a2b9908a5ad8decce1616
parent: 57ab6d09248ba036e7feb32f8b9527ec643f826c (diff)
13 files changed, 500 insertions, 124 deletions
diff --git a/crates/typst-syntax/src/ast.rs b/crates/typst-syntax/src/ast.rs
index 8f8eaac4..fc689a68 100644
--- a/crates/typst-syntax/src/ast.rs
+++ b/crates/typst-syntax/src/ast.rs
@@ -8,9 +8,7 @@ use std::ops::Deref;
 use ecow::EcoString;
 use unscanny::Scanner;
 
-use crate::{
-    is_id_continue, is_id_start, is_newline, split_newlines, Span, SyntaxKind, SyntaxNode,
-};
+use crate::{is_newline, Span, SyntaxKind, SyntaxNode};
 
 /// A typed AST node.
 pub trait AstNode<'a>: Sized {
@@ -558,87 +556,51 @@ node! {
 }
 
 impl<'a> Raw<'a> {
-    /// The trimmed raw text.
-    pub fn text(self) -> EcoString {
-        let mut text = self.0.text().as_str();
-        let blocky = text.starts_with("```");
-        text = text.trim_matches('`');
-
-        // Trim tag, one space at the start, and one space at the end if the
-        // last non-whitespace char is a backtick.
-        if blocky {
-            let mut s = Scanner::new(text);
-            if s.eat_if(is_id_start) {
-                s.eat_while(is_id_continue);
-            }
-            text = s.after();
-            text = text.strip_prefix(' ').unwrap_or(text);
-            if text.trim_end().ends_with('`') {
-                text = text.strip_suffix(' ').unwrap_or(text);
-            }
-        }
-
-        // Split into lines.
-        let mut lines = split_newlines(text);
-
-        if blocky {
-            let dedent = lines
-                .iter()
-                .skip(1)
-                .filter(|line| !line.chars().all(char::is_whitespace))
-                // The line with the closing ``` is always taken into account
-                .chain(lines.last())
-                .map(|line| line.chars().take_while(|c| c.is_whitespace()).count())
-                .min()
-                .unwrap_or(0);
-
-            // Dedent based on column, but not for the first line.
-            for line in lines.iter_mut().skip(1) {
-                let offset = line.chars().take(dedent).map(char::len_utf8).sum();
-                *line = &line[offset..];
-            }
-
-            let is_whitespace = |line: &&str| line.chars().all(char::is_whitespace);
-
-            // Trims a sequence of whitespace followed by a newline at the start.
-            if lines.first().is_some_and(is_whitespace) {
-                lines.remove(0);
-            }
-
-            // Trims a newline followed by a sequence of whitespace at the end.
-            if lines.last().is_some_and(is_whitespace) {
-                lines.pop();
-            }
-        }
-
-        lines.join("\n").into()
+    /// The lines in the raw block.
+    pub fn lines(self) -> impl DoubleEndedIterator<Item = Text<'a>> {
+        self.0.children().filter_map(SyntaxNode::cast)
     }
 
     /// An optional identifier specifying the language to syntax-highlight in.
-    pub fn lang(self) -> Option<&'a str> {
-        let text = self.0.text();
-
+    pub fn lang(self) -> Option<RawLang<'a>> {
         // Only blocky literals are supposed to contain a language.
-        if !text.starts_with("```") {
+        let delim: RawDelim = self.0.cast_first_match()?;
+        if delim.0.len() < 3 {
             return Option::None;
         }
 
-        let inner = text.trim_start_matches('`');
-        let mut s = Scanner::new(inner);
-        s.eat_if(is_id_start).then(|| {
-            s.eat_while(is_id_continue);
-            s.before()
-        })
+        self.0.cast_first_match()
     }
 
     /// Whether the raw text should be displayed in a separate block.
     pub fn block(self) -> bool {
-        let text = self.0.text();
-        text.starts_with("```") && text.chars().any(is_newline)
+        self.0
+            .cast_first_match()
+            .is_some_and(|delim: RawDelim| delim.0.len() >= 3)
+            && self.0.children().any(|e| {
+                e.kind() == SyntaxKind::RawTrimmed && e.text().chars().any(is_newline)
+            })
     }
 }
 
 node! {
+    /// A language tag at the start of a raw element: ``typ ``.
+    RawLang
+}
+
+impl<'a> RawLang<'a> {
+    /// Get the language tag.
+    pub fn get(self) -> &'a EcoString {
+        self.0.text()
+    }
+}
+
+node! {
+    /// A raw delimiter consisting of 1 or 3+ backticks: `` ` ``.
+    RawDelim
+}
+
+node! {
     /// A hyperlink: `https://typst.org`.
     Link
 }
diff --git a/crates/typst-syntax/src/highlight.rs b/crates/typst-syntax/src/highlight.rs
index 19d35d0a..f1c8a298 100644
--- a/crates/typst-syntax/src/highlight.rs
+++ b/crates/typst-syntax/src/highlight.rs
@@ -153,6 +153,9 @@ pub fn highlight(node: &LinkedNode) -> Option<Tag> {
         SyntaxKind::Strong => Some(Tag::Strong),
         SyntaxKind::Emph => Some(Tag::Emph),
         SyntaxKind::Raw => Some(Tag::Raw),
+        SyntaxKind::RawLang => None,
+        SyntaxKind::RawTrimmed => None,
+        SyntaxKind::RawDelim => None,
         SyntaxKind::Link => Some(Tag::Link),
         SyntaxKind::Label => Some(Tag::Label),
         SyntaxKind::Ref => Some(Tag::Ref),
diff --git a/crates/typst-syntax/src/kind.rs b/crates/typst-syntax/src/kind.rs
index e5dd4e9b..c34f6002 100644
--- a/crates/typst-syntax/src/kind.rs
+++ b/crates/typst-syntax/src/kind.rs
@@ -28,6 +28,12 @@ pub enum SyntaxKind {
     Emph,
     /// Raw text with optional syntax highlighting: `` `...` ``.
     Raw,
+    /// A language tag at the start of raw text: ``typ ``.
+    RawLang,
+    /// A raw delimiter consisting of 1 or 3+ backticks: `` ` ``.
+    RawDelim,
+    /// A sequence of whitespace to ignore in a raw block: `    `.
+    RawTrimmed,
     /// A hyperlink: `https://typst.org`.
     Link,
     /// A label: `<intro>`.
@@ -369,6 +375,9 @@ impl SyntaxKind {
             Self::Strong => "strong content",
             Self::Emph => "emphasized content",
             Self::Raw => "raw block",
+            Self::RawLang => "raw language tag",
+            Self::RawTrimmed => "raw trimmed",
+            Self::RawDelim => "raw delimiter",
             Self::Link => "link",
             Self::Label => "label",
             Self::Ref => "reference",
diff --git a/crates/typst-syntax/src/lexer.rs b/crates/typst-syntax/src/lexer.rs
index 300a8353..aacbee62 100644
--- a/crates/typst-syntax/src/lexer.rs
+++ b/crates/typst-syntax/src/lexer.rs
@@ -16,6 +16,8 @@ pub(super) struct Lexer<'s> {
     mode: LexMode,
     /// Whether the last token contained a newline.
     newline: bool,
+    /// The state held by raw line lexing.
+    raw: Vec<(SyntaxKind, usize)>,
     /// An error for the last token.
     error: Option<EcoString>,
 }
@@ -29,6 +31,8 @@ pub(super) enum LexMode {
     Math,
     /// Keywords, literals and operators.
     Code,
+    /// The contents of a raw block.
+    Raw,
 }
 
 impl<'s> Lexer<'s> {
@@ -40,6 +44,7 @@ impl<'s> Lexer<'s> {
             mode,
             newline: false,
             error: None,
+            raw: Vec::new(),
         }
     }
 
@@ -86,6 +91,14 @@ impl Lexer<'_> {
 /// Shared.
 impl Lexer<'_> {
     pub fn next(&mut self) -> SyntaxKind {
+        if self.mode == LexMode::Raw {
+            let Some((kind, end)) = self.raw.pop() else {
+                return SyntaxKind::Eof;
+            };
+            self.s.jump(end);
+            return kind;
+        }
+
         self.newline = false;
         self.error = None;
         let start = self.s.cursor();
@@ -101,6 +114,7 @@ impl Lexer<'_> {
                 LexMode::Markup => self.markup(start, c),
                 LexMode::Math => self.math(start, c),
                 LexMode::Code => self.code(start, c),
+                LexMode::Raw => unreachable!(),
             },
 
             None => SyntaxKind::Eof,
@@ -224,15 +238,23 @@ impl Lexer<'_> {
     }
 
     fn raw(&mut self) -> SyntaxKind {
+        let start = self.s.cursor() - 1;
+        self.raw.clear();
+
+        // Determine number of opening backticks.
         let mut backticks = 1;
         while self.s.eat_if('`') {
             backticks += 1;
         }
 
+        // Special case for ``.
         if backticks == 2 {
-            return SyntaxKind::Raw;
+            self.push_raw(SyntaxKind::RawDelim);
+            self.s.jump(start + 1);
+            return SyntaxKind::RawDelim;
         }
 
+        // Find end of raw text.
         let mut found = 0;
         while found < backticks {
             match self.s.eat() {
@@ -246,12 +268,99 @@ impl Lexer<'_> {
             return self.error("unclosed raw text");
         }
 
-        SyntaxKind::Raw
+        let end = self.s.cursor();
+        if backticks >= 3 {
+            self.blocky_raw(start, end, backticks);
+        } else {
+            // Single backtick needs no trimming or extra fanciness.
+            self.s.jump(end - backticks);
+            self.push_raw(SyntaxKind::Text);
+            self.s.jump(end);
+        }
+
+        // Closing delimiter.
+        self.push_raw(SyntaxKind::RawDelim);
+
+        // The saved tokens will be removed in reverse.
+        self.raw.reverse();
+
+        // Opening delimiter.
+        self.s.jump(start + backticks);
+        SyntaxKind::RawDelim
+    }
+
+    fn blocky_raw(&mut self, start: usize, end: usize, backticks: usize) {
+        // Language tag.
+        self.s.jump(start + backticks);
+        if self.s.eat_if(is_id_start) {
+            self.s.eat_while(is_id_continue);
+            self.push_raw(SyntaxKind::RawLang);
+        }
+
+        // Determine inner content between backticks and with trimmed
+        // single spaces (line trimming comes later).
+        self.s.eat_if(' ');
+        let mut inner = self.s.to(end - backticks);
+        if inner.trim_end().ends_with('`') {
+            inner = inner.strip_suffix(' ').unwrap_or(inner);
+        }
+
+        // Determine dedent level.
+        let lines = split_newlines(inner);
+        let dedent = lines
+            .iter()
+            .skip(1)
+            .filter(|line| !line.chars().all(char::is_whitespace))
+            // The line with the closing ``` is always taken into account
+            .chain(lines.last())
+            .map(|line| line.chars().take_while(|c| c.is_whitespace()).count())
+            .min()
+            .unwrap_or(0);
+
+        let is_whitespace = |line: &&str| line.chars().all(char::is_whitespace);
+        let starts_whitespace = lines.first().is_some_and(is_whitespace);
+        let ends_whitespace = lines.last().is_some_and(is_whitespace);
+
+        let mut lines = lines.into_iter();
+        let mut skipped = false;
+
+        // Trim whitespace + newline at start.
+        if starts_whitespace {
+            self.s.advance(lines.next().unwrap().len());
+            skipped = true;
+        }
+        // Trim whitespace + newline at end.
+        if ends_whitespace {
+            lines.next_back();
+        }
+
+        // Add lines.
+        for (i, line) in lines.enumerate() {
+            let dedent = if i == 0 && !skipped { 0 } else { dedent };
+            let offset: usize = line.chars().take(dedent).map(char::len_utf8).sum();
+            self.s.eat_newline();
+            self.s.advance(offset);
+            self.push_raw(SyntaxKind::RawTrimmed);
+            self.s.advance(line.len() - offset);
+            self.push_raw(SyntaxKind::Text);
+        }
+
+        // Add the remaining trimmed text before the closing delimiter.
+        if self.s.cursor() < end - backticks {
+            self.s.jump(end - backticks);
+            self.push_raw(SyntaxKind::RawTrimmed);
+        }
+        self.s.jump(end);
+    }
+
+    fn push_raw(&mut self, kind: SyntaxKind) {
+        let end = self.s.cursor();
+        self.raw.push((kind, end));
     }
 
     fn link(&mut self) -> SyntaxKind {
         let (link, balanced) = link_prefix(self.s.after());
-        self.s.jump(self.s.cursor() + link.len());
+        self.s.advance(link.len());
 
         if !balanced {
             return self.error(
@@ -632,6 +741,25 @@ fn keyword(ident: &str) -> Option<SyntaxKind> {
     })
 }
 
+trait ScannerExt {
+    fn advance(&mut self, by: usize);
+    fn eat_newline(&mut self) -> bool;
+}
+
+impl ScannerExt for Scanner<'_> {
+    fn advance(&mut self, by: usize) {
+        self.jump(self.cursor() + by);
+    }
+
+    fn eat_newline(&mut self) -> bool {
+        let ate = self.eat_if(is_newline);
+        if ate && self.before().ends_with('\r') {
+            self.eat_if('\n');
+        }
+        ate
+    }
+}
+
 /// Whether a character will become a Space token in Typst
 #[inline]
 fn is_space(character: char, mode: LexMode) -> bool {
diff --git a/crates/typst-syntax/src/parser.rs b/crates/typst-syntax/src/parser.rs
index f4bb19e1..4785b8a1 100644
--- a/crates/typst-syntax/src/parser.rs
+++ b/crates/typst-syntax/src/parser.rs
@@ -116,13 +116,13 @@ fn markup_expr(p: &mut Parser, at_start: &mut bool) {
         | SyntaxKind::Escape
         | SyntaxKind::Shorthand
         | SyntaxKind::SmartQuote
-        | SyntaxKind::Raw
         | SyntaxKind::Link
         | SyntaxKind::Label => p.eat(),
 
         SyntaxKind::Hash => embedded_code_expr(p),
         SyntaxKind::Star => strong(p),
         SyntaxKind::Underscore => emph(p),
+        SyntaxKind::RawDelim => raw(p),
         SyntaxKind::HeadingMarker if *at_start => heading(p),
         SyntaxKind::ListMarker if *at_start => list_item(p),
         SyntaxKind::EnumMarker if *at_start => enum_item(p),
@@ -172,6 +172,22 @@ fn emph(p: &mut Parser) {
     p.wrap(m, SyntaxKind::Emph);
 }
 
+/// Parses raw text with optional syntax highlighting: `` `...` ``.
+fn raw(p: &mut Parser) {
+    let m = p.marker();
+    p.enter(LexMode::Raw);
+    p.assert(SyntaxKind::RawDelim);
+
+    // Eats until the closing delimiter.
+    while !p.eof() && !p.at(SyntaxKind::RawDelim) {
+        p.eat();
+    }
+
+    p.expect(SyntaxKind::RawDelim);
+    p.exit();
+    p.wrap(m, SyntaxKind::Raw);
+}
+
 /// Parses a section heading: `= Introduction`.
 fn heading(p: &mut Parser) {
     const END: SyntaxSet = SyntaxSet::new()
@@ -747,6 +763,7 @@ fn code_primary(p: &mut Parser, atomic: bool) {
         SyntaxKind::LeftBrace => code_block(p),
         SyntaxKind::LeftBracket => content_block(p),
         SyntaxKind::LeftParen => expr_with_paren(p, atomic),
+        SyntaxKind::RawDelim => raw(p),
         SyntaxKind::Dollar => equation(p),
         SyntaxKind::Let => let_binding(p),
         SyntaxKind::Set => set_rule(p),
@@ -768,8 +785,7 @@ fn code_primary(p: &mut Parser, atomic: bool) {
         | SyntaxKind::Bool
         | SyntaxKind::Numeric
         | SyntaxKind::Str
-        | SyntaxKind::Label
-        | SyntaxKind::Raw => p.eat(),
+        | SyntaxKind::Label => p.eat(),
 
         _ => p.expected("expression"),
     }
diff --git a/crates/typst-syntax/src/set.rs b/crates/typst-syntax/src/set.rs
index 906d5fac..39e64651 100644
--- a/crates/typst-syntax/src/set.rs
+++ b/crates/typst-syntax/src/set.rs
@@ -15,7 +15,10 @@ impl SyntaxSet {
     }
 
     /// Insert a syntax kind into the set.
+    ///
+    /// You can only add kinds with discriminant < 128.
     pub const fn add(self, kind: SyntaxKind) -> Self {
+        assert!((kind as u8) < BITS);
         Self(self.0 | bit(kind))
     }
 
@@ -26,10 +29,12 @@ impl SyntaxSet {
 
     /// Whether the set contains the given syntax kind.
     pub const fn contains(&self, kind: SyntaxKind) -> bool {
-        (self.0 & bit(kind)) != 0
+        (kind as u8) < BITS && (self.0 & bit(kind)) != 0
     }
 }
 
+const BITS: u8 = 128;
+
 const fn bit(kind: SyntaxKind) -> u128 {
     1 << (kind as usize)
 }
@@ -54,7 +59,7 @@ pub const MARKUP_EXPR: SyntaxSet = SyntaxSet::new()
     .add(SyntaxKind::Escape)
     .add(SyntaxKind::Shorthand)
     .add(SyntaxKind::SmartQuote)
-    .add(SyntaxKind::Raw)
+    .add(SyntaxKind::RawDelim)
     .add(SyntaxKind::Link)
     .add(SyntaxKind::Label)
     .add(SyntaxKind::Hash)
@@ -119,7 +124,7 @@ pub const ATOMIC_CODE_PRIMARY: SyntaxSet = SyntaxSet::new()
     .add(SyntaxKind::Numeric)
     .add(SyntaxKind::Str)
     .add(SyntaxKind::Label)
-    .add(SyntaxKind::Raw);
+    .add(SyntaxKind::RawDelim);
 
 /// Syntax kinds that are unary operators.
 pub const UNARY_OP: SyntaxSet = SyntaxSet::new()
@@ -172,11 +177,6 @@ mod tests {
     use super::*;
 
     #[test]
-    fn test_size() {
-        assert!((SyntaxKind::Eof as usize) < 128);
-    }
-
-    #[test]
     fn test_set() {
         let set = SyntaxSet::new().add(SyntaxKind::And).add(SyntaxKind::Or);
         assert!(set.contains(SyntaxKind::And));
diff --git a/crates/typst/src/eval/markup.rs b/crates/typst/src/eval/markup.rs
index 1bb12d49..d43e4495 100644
--- a/crates/typst/src/eval/markup.rs
+++ b/crates/typst/src/eval/markup.rs
@@ -8,7 +8,9 @@ use crate::model::{
 };
 use crate::symbols::Symbol;
 use crate::syntax::ast::{self, AstNode};
-use crate::text::{LinebreakElem, RawElem, SmartQuoteElem, SpaceElem, TextElem};
+use crate::text::{
+    LinebreakElem, RawContent, RawElem, SmartQuoteElem, SpaceElem, TextElem,
+};
 
 impl Eval for ast::Markup<'_> {
     type Output = Content;
@@ -165,9 +167,10 @@ impl Eval for ast::Raw<'_> {
     type Output = Content;
 
     fn eval(self, _: &mut Vm) -> SourceResult<Self::Output> {
-        let mut elem = RawElem::new(self.text()).with_block(self.block());
+        let lines = self.lines().map(|line| (line.get().clone(), line.span())).collect();
+        let mut elem = RawElem::new(RawContent::Lines(lines)).with_block(self.block());
         if let Some(lang) = self.lang() {
-            elem.push_lang(Some(lang.into()));
+            elem.push_lang(Some(lang.get().clone()));
         }
         Ok(elem.pack())
     }
diff --git a/crates/typst/src/foundations/value.rs b/crates/typst/src/foundations/value.rs
index b5f143d2..f661228a 100644
--- a/crates/typst/src/foundations/value.rs
+++ b/crates/typst/src/foundations/value.rs
@@ -19,7 +19,7 @@ use crate::foundations::{
 use crate::layout::{Abs, Angle, Em, Fr, Length, Ratio, Rel};
 use crate::symbols::Symbol;
 use crate::syntax::{ast, Span};
-use crate::text::{RawElem, TextElem};
+use crate::text::{RawContent, RawElem, TextElem};
 use crate::util::ArcExt;
 use crate::visualize::{Color, Gradient, Pattern};
 
@@ -209,7 +209,7 @@ impl Value {
             Self::Symbol(v) => TextElem::packed(v.get()),
             Self::Content(v) => v,
             Self::Module(module) => module.content(),
-            _ => RawElem::new(self.repr())
+            _ => RawElem::new(RawContent::Text(self.repr()))
                 .with_lang(Some("typc".into()))
                 .with_block(false)
                 .pack(),
diff --git a/crates/typst/src/layout/inline/mod.rs b/crates/typst/src/layout/inline/mod.rs
index 2802bbcb..6add4310 100644
--- a/crates/typst/src/layout/inline/mod.rs
+++ b/crates/typst/src/layout/inline/mod.rs
@@ -287,7 +287,7 @@ impl SpanMapper {
     fn span_at(&self, offset: usize) -> (Span, u16) {
         let mut cursor = 0;
         for &(len, span) in &self.0 {
-            if (cursor..=cursor + len).contains(&offset) {
+            if (cursor..cursor + len).contains(&offset) {
                 return (span, u16::try_from(offset - cursor).unwrap_or(0));
             }
             cursor += len;
diff --git a/crates/typst/src/layout/inline/shaping.rs b/crates/typst/src/layout/inline/shaping.rs
index b558d5ad..f914d347 100644
--- a/crates/typst/src/layout/inline/shaping.rs
+++ b/crates/typst/src/layout/inline/shaping.rs
@@ -231,6 +231,7 @@ impl<'a> ShapedText<'a> {
         let decos = TextElem::deco_in(self.styles);
         let fill = TextElem::fill_in(self.styles);
         let stroke = TextElem::stroke_in(self.styles);
+        let span_offset = TextElem::span_offset_in(self.styles);
 
         for ((font, y_offset), group) in
             self.glyphs.as_ref().group_by_key(|g| (g.font.clone(), g.y_offset))
@@ -267,6 +268,12 @@ impl<'a> ShapedText<'a> {
                     frame.size_mut().x += justification_left.at(self.size)
                         + justification_right.at(self.size);
 
+                    // We may not be able to reach the offset completely if
+                    // it exceeds u16, but better to have a roughly correct
+                    // span offset than nothing.
+                    let mut span = shaped.span;
+                    span.1 = span.1.saturating_add(span_offset.saturating_as());
+
                     // |<---- a Glyph ---->|
                     //  -->|ShapedGlyph|<--
                     // +---+-----------+---+
@@ -293,7 +300,7 @@ impl<'a> ShapedText<'a> {
                         x_offset: shaped.x_offset + justification_left,
                         range: (shaped.range.start - range.start).saturating_as()
                             ..(shaped.range.end - range.start).saturating_as(),
-                        span: shaped.span,
+                        span,
                     }
                 })
                 .collect();
diff --git a/crates/typst/src/text/mod.rs b/crates/typst/src/text/mod.rs
index 13049b12..13193fe8 100644
--- a/crates/typst/src/text/mod.rs
+++ b/crates/typst/src/text/mod.rs
@@ -622,6 +622,12 @@ pub struct TextElem {
     #[required]
     pub text: EcoString,
 
+    /// The offset of the text in the text syntax node referenced by this
+    /// element's span.
+    #[internal]
+    #[ghost]
+    pub span_offset: usize,
+
     /// A delta to apply on the font weight.
     #[internal]
     #[fold]
diff --git a/crates/typst/src/text/raw.rs b/crates/typst/src/text/raw.rs
index c71b16b1..d47cd947 100644
--- a/crates/typst/src/text/raw.rs
+++ b/crates/typst/src/text/raw.rs
@@ -17,7 +17,7 @@ use crate::foundations::{
 };
 use crate::layout::{BlockElem, Em, HAlignment};
 use crate::model::Figurable;
-use crate::syntax::{split_newlines, LinkedNode, Spanned};
+use crate::syntax::{split_newlines, LinkedNode, Span, Spanned};
 use crate::text::{
     FontFamily, FontList, Hyphenate, Lang, LinebreakElem, LocalName, Region,
     SmartQuoteElem, TextElem, TextSize,
@@ -27,8 +27,9 @@ use crate::visualize::Color;
 use crate::{syntax, World};
 
 // Shorthand for highlighter closures.
-type StyleFn<'a> = &'a mut dyn FnMut(&LinkedNode, Range<usize>, synt::Style) -> Content;
-type LineFn<'a> = &'a mut dyn FnMut(i64, Range<usize>, &mut Vec<Content>);
+type StyleFn<'a> =
+    &'a mut dyn FnMut(usize, &LinkedNode, Range<usize>, synt::Style) -> Content;
+type LineFn<'a> = &'a mut dyn FnMut(usize, Range<usize>, &mut Vec<Content>);
 
 /// Raw text with optional syntax highlighting.
 ///
@@ -101,7 +102,7 @@ pub struct RawElem {
     /// ```
     /// ````
     #[required]
-    pub text: EcoString,
+    pub text: RawContent,
 
     /// Whether the raw text is displayed as a separate block.
     ///
@@ -300,17 +301,24 @@ impl Packed<RawElem> {
     #[comemo::memoize]
     fn highlight(&self, styles: StyleChain) -> Vec<Packed<RawLine>> {
         let elem = self.as_ref();
-        let span = self.span();
 
-        let mut text = elem.text().clone();
-        if text.contains('\t') {
-            let tab_size = RawElem::tab_size_in(styles);
-            text = align_tabs(&text, tab_size);
-        }
+        let text = elem.text();
+        let lines = match text {
+            RawContent::Lines(lines) if !lines.iter().any(|(s, _)| s.contains('\t')) => {
+                lines.clone()
+            }
+            _ => {
+                let mut text = text.get();
+                if text.contains('\t') {
+                    let tab_size = RawElem::tab_size_in(styles);
+                    text = align_tabs(&text, tab_size);
+                }
+                let lines = split_newlines(&text);
+                lines.into_iter().map(|line| (line.into(), self.span())).collect()
+            }
+        };
 
-        let lines = split_newlines(&text);
         let count = lines.len() as i64;
-
         let lang = elem
             .lang(styles)
             .as_ref()
@@ -332,6 +340,7 @@ impl Packed<RawElem> {
 
         let mut seq = vec![];
         if matches!(lang.as_deref(), Some("typ" | "typst" | "typc")) {
+            let text = RawContent::Lines(lines.clone()).get(); // Tab-aligned, like `lines`.
             let root = match lang.as_deref() {
                 Some("typc") => syntax::parse_code(&text),
                 _ => syntax::parse(&text),
@@ -341,16 +350,23 @@ impl Packed<RawElem> {
                 &text,
                 LinkedNode::new(&root),
                 synt::Highlighter::new(theme),
-                &mut |_, range, style| styled(&text[range], foreground, style),
+                &mut |i, _, range, style| {
+                    // Find start of line.
+                    // Note: Dedent is already applied to the text
+                    let span_offset = text[..range.start]
+                        .rfind('\n')
+                        .map_or(0, |i| range.start - (i + 1));
+                    styled(&text[range], foreground, style, lines[i].1, span_offset)
+                },
                 &mut |i, range, line| {
                     seq.push(
                         Packed::new(RawLine::new(
-                            i + 1,
+                            (i + 1) as i64,
                             count,
                             EcoString::from(&text[range]),
                             Content::sequence(line.drain(..)),
                         ))
-                        .spanned(span),
+                        .spanned(lines[i].1),
                     );
                 },
             )
@@ -366,33 +382,43 @@ impl Packed<RawElem> {
                 })
         }) {
             let mut highlighter = syntect::easy::HighlightLines::new(syntax, theme);
-            for (i, line) in lines.into_iter().enumerate() {
+            for (i, (line, line_span)) in lines.into_iter().enumerate() {
                 let mut line_content = vec![];
-                for (style, piece) in
-                    highlighter.highlight_line(line, syntax_set).into_iter().flatten()
+                let mut span_offset = 0;
+                for (style, piece) in highlighter
+                    .highlight_line(line.as_str(), syntax_set)
+                    .into_iter()
+                    .flatten()
                 {
-                    line_content.push(styled(piece, foreground, style));
+                    line_content.push(styled(
+                        piece,
+                        foreground,
+                        style,
+                        line_span,
+                        span_offset,
+                    ));
+                    span_offset += piece.len();
                 }
 
                 seq.push(
                     Packed::new(RawLine::new(
                         i as i64 + 1,
                         count,
-                        EcoString::from(line),
+                        line,
                         Content::sequence(line_content),
                     ))
-                    .spanned(span),
+                    .spanned(line_span),
                 );
             }
         } else {
-            seq.extend(lines.into_iter().enumerate().map(|(i, line)| {
+            seq.extend(lines.into_iter().enumerate().map(|(i, (line, line_span))| {
                 Packed::new(RawLine::new(
                     i as i64 + 1,
                     count,
-                    EcoString::from(line),
-                    TextElem::packed(line),
+                    line.clone(),
+                    TextElem::packed(line).spanned(line_span),
                 ))
-                .spanned(span)
+                .spanned(line_span)
             }));
         };
 
@@ -478,10 +504,42 @@ impl Figurable for Packed<RawElem> {}
 
 impl PlainText for Packed<RawElem> {
     fn plain_text(&self, text: &mut EcoString) {
-        text.push_str(self.text());
+        text.push_str(&self.text().get());
+    }
+}
+
+/// The content of the raw text.
+#[derive(Debug, Clone, Hash, PartialEq)]
+pub enum RawContent {
+    /// From a string.
+    Text(EcoString),
+    /// From lines of text.
+    Lines(EcoVec<(EcoString, Span)>),
+}
+
+impl RawContent {
+    /// Returns or synthesizes the text content of the raw text.
+    fn get(&self) -> EcoString {
+        match self.clone() {
+            RawContent::Text(text) => text,
+            RawContent::Lines(lines) => {
+                let mut lines = lines.into_iter().map(|(s, _)| s);
+                if lines.len() <= 1 {
+                    lines.next().unwrap_or_default()
+                } else {
+                    lines.collect::<Vec<_>>().join("\n").into()
+                }
+            }
+        }
     }
 }
 
+cast! {
+    RawContent,
+    self => self.get().into_value(),
+    v: EcoString => Self::Text(v),
+}
+
 /// A highlighted line of raw text.
 ///
 /// This is a helper element that is synthesized by [`raw`]($raw) elements.
@@ -536,7 +594,7 @@ struct ThemedHighlighter<'a> {
     /// The range of the current line.
     range: Range<usize>,
     /// The current line number.
-    line: i64,
+    line: usize,
     /// The function to style a piece of text.
     style_fn: StyleFn<'a>,
     /// The function to append a line.
@@ -597,8 +655,12 @@ impl<'a> ThemedHighlighter<'a> {
 
                 let offset = self.node.range().start + len;
                 let token_range = offset..(offset + line.len());
-                self.current_line
-                    .push((self.style_fn)(&self.node, token_range, style));
+                self.current_line.push((self.style_fn)(
+                    self.line,
+                    &self.node,
+                    token_range,
+                    style,
+                ));
 
                 len += line.len() + 1;
             }
@@ -621,23 +683,33 @@ impl<'a> ThemedHighlighter<'a> {
 }
 
 /// Style a piece of text with a syntect style.
-fn styled(piece: &str, foreground: synt::Color, style: synt::Style) -> Content {
-    let mut body = TextElem::packed(piece);
+fn styled(
+    piece: &str,
+    foreground: synt::Color,
+    style: synt::Style,
+    span: Span,
+    span_offset: usize,
+) -> Content {
+    let mut body = TextElem::packed(piece).spanned(span);
+
+    if span_offset > 0 {
+        body = body.styled(TextElem::set_span_offset(span_offset));
+    }
 
     if style.foreground != foreground {
         body = body.styled(TextElem::set_fill(to_typst(style.foreground).into()));
     }
 
     if style.font_style.contains(synt::FontStyle::BOLD) {
-        body = body.strong();
+        body = body.strong().spanned(span);
     }
 
     if style.font_style.contains(synt::FontStyle::ITALIC) {
-        body = body.emph();
+        body = body.emph().spanned(span);
     }
 
     if style.font_style.contains(synt::FontStyle::UNDERLINE) {
-        body = body.underlined();
+        body = body.underlined().spanned(span);
     }
 
     body
diff --git a/tests/typ/compiler/raw.typ b/tests/typ/compiler/raw.typ
new file mode 100644
index 00000000..3084146d
--- /dev/null
+++ b/tests/typ/compiler/raw.typ
@@ -0,0 +1,170 @@
+// Test new raw parser
+// Ref: false
+
+---
+#let empty = (
+  name: "empty",
+  input: ``,
+  text: "",
+)
+
+#let backtick = (
+  name: "backtick",
+  input: ``` ` ```,
+  text: "`",
+  block: false,
+)
+
+#let lang-backtick = (
+  name: "lang-backtick",
+  input: ```js ` ```,
+  lang: "js",
+  text: "`",
+  block: false,
+)
+
+// The language tag stops on space
+#let lang-space = (
+  name: "lang-space",
+  input: ```js test ```,
+  lang: "js",
+  text: "test ",
+  block: false,
+)
+
+// The language tag stops on newline
+#let lang-newline = (
+  name: "lang-newline",
+  input: ```js
+test
+```,
+  lang: "js",
+  text: "test",
+  block: true,
+)
+
+// The first line and the last line are ignored
+#let blocky = (
+  name: "blocky",
+  input: {
+```
+test
+```
+},
+  text: "test",
+  block: true,
+)
+
+// A blocky raw should handle dedents
+#let blocky-dedent = (
+  name: "blocky-dedent",
+  input: {
+```
+ test
+ ```
+  },
+  text: "test",
+  block: true,
+)
+
+// When there is content in the first line, exactly one leading space should be eaten.
+#let blocky-dedent-firstline = (
+  name: "blocky-dedent-firstline",
+  input: ``` test
+  ```,
+  text: "test",
+  block: true,
+)
+
+// When there is content in the first line, exactly one leading space should be eaten.
+#let blocky-dedent-firstline2 = (
+  name: "blocky-dedent-firstline2",
+  input: ``` test
+```,
+  text: "test",
+  block: true,
+)
+
+// The first line is not affected by dedent, and the middle lines don't consider the whitespace prefix of the first line.
+#let blocky-dedent-firstline3 = (
+  name: "blocky-dedent-firstline3",
+  input: ``` test
+     test2
+  ```,
+  text: "test\n   test2",
+  block: true,
+)
+
+// The first line is not affected by dedent, and the middle lines don't consider the whitespace prefix of the first line.
+#let blocky-dedent-firstline4 = (
+  name: "blocky-dedent-firstline4",
+  input: ```     test
+  test2
+  ```,
+  text: "    test\ntest2",
+  block: true,
+)
+
+#let blocky-dedent-lastline = (
+  name: "blocky-dedent-lastline",
+  input: ```
+  test
+ ```,
+  text: " test",
+  block: true,
+)
+
+#let blocky-dedent-lastline2 = (
+  name: "blocky-dedent-lastline2",
+  input: ```
+  test
+   ```,
+  text: "test",
+  block: true,
+)
+
+#let blocky-tab = (
+  name: "blocky-tab",
+  input: {
+```
+	test
+```
+},
+  text: "\ttest",
+  block: true,
+)
+
+#let blocky-tab-dedent = (
+  name: "blocky-tab-dedent",
+  input: {
+```
+	test
+  
+ ```
+},
+  text: "test\n ",
+  block: true,
+)
+
+#let cases = ( // Every case defined above must be listed here to be checked.
+  empty,
+  backtick,
+  lang-backtick,
+  lang-space,
+  lang-newline,
+  blocky,
+  blocky-dedent,
+  blocky-dedent-firstline,
+  blocky-dedent-firstline2,
+  blocky-dedent-firstline3, blocky-dedent-firstline4,
+  blocky-dedent-lastline,
+  blocky-dedent-lastline2,
+  blocky-tab,
+  blocky-tab-dedent,
+)
+
+#for c in cases {
+  assert.eq(c.text, c.input.text, message: "in point " + c.name + ", expect " + repr(c.text) + ", got " + repr(c.input.text) + "")
+  let block = c.at("block", default: false)
+  assert.eq(block, c.input.block, message: "in point " + c.name + ", expect " + repr(block) + ", got " + repr(c.input.block) + "")
+}
author	Myriad-Dreamin <35292584+Myriad-Dreamin@users.noreply.github.com>	2024-03-01 17:17:41 +0800
committer	GitHub <noreply@github.com>	2024-03-01 09:17:41 +0000
commit	030041466b5b8453ca23e43a6385f4592f78a56c (patch)
tree	7af9f2d34c349980881a2b9908a5ad8decce1616
parent	57ab6d09248ba036e7feb32f8b9527ec643f826c (diff)