Refactor raw blocks 💱

author: Laurenz <laurmaedje@gmail.com> 2020-09-30 18:59:33 +0200
committer: Laurenz <laurmaedje@gmail.com> 2020-09-30 18:59:33 +0200
commit: 4077a7c11ea19b1b6b6b6fe3014b9018846cf21b (patch)
tree: 70e4c891c2c660b4136890cebbae7c375fe36c05 /src/parse
parent: 7cc279f7ae122f4c40592004dde89792c636b3c8 (diff)
4 files changed, 173 insertions, 241 deletions
diff --git a/src/parse/escaping.rs b/src/parse/escaping.rs
index 55b1fe67..a2ff963b 100644
--- a/src/parse/escaping.rs
+++ b/src/parse/escaping.rs
@@ -1,4 +1,5 @@
 use super::is_newline_char;
+use crate::syntax::{Ident, Raw};
 
 /// Resolves all escape sequences in a string.
 pub fn unescape_string(string: &str) -> String {
@@ -56,101 +57,60 @@ pub fn unescape_string(string: &str) -> String {
     out
 }
 
-/// Resolves all escape sequences in raw markup (between backticks) and splits it into
-/// into lines.
-pub fn unescape_raw(raw: &str) -> Vec<String> {
+/// Resolves the language tag and trims the raw text.
+///
+/// Returns a `Raw` holding:
+/// - The language tag, if one was found
+/// - The raw lines
+/// - `inline`, which is true only if the untrimmed text contained no newline.
+pub fn process_raw(raw: &str) -> Raw {
+    let (lang, inner) = split_after_lang_tag(raw);
+    let (lines, had_newline) = trim_and_split_raw(inner);
+    Raw { lang, lines, inline: !had_newline }
+}
+
+/// Parse the lang tag and return it alongside the remaining inner raw text.
+fn split_after_lang_tag(raw: &str) -> (Option<Ident>, &str) {
+    let mut lang = String::new();
+
+    let mut inner = raw;
     let mut iter = raw.chars();
-    let mut text = String::new();
 
     while let Some(c) = iter.next() {
-        if c == '\\' {
-            if let Some(c) = iter.next() {
-                if c != '\\' && c != '`' {
-                    text.push('\\');
-                }
-
-                text.push(c);
-            } else {
-                text.push('\\');
-            }
-        } else {
-            text.push(c);
+        if c == '`' || c.is_whitespace() || is_newline_char(c) {
+            break;
         }
+
+        inner = iter.as_str();
+        lang.push(c);
     }
 
-    split_lines(&text)
+    (Ident::new(lang), inner)
 }
 
-/// Resolves all escape sequences in code markup (between triple backticks) and splits it
-/// into into lines.
-pub fn unescape_code(raw: &str) -> Vec<String> {
-    let mut iter = raw.chars().peekable();
-    let mut text = String::new();
-    let mut backticks = 0u32;
-    let mut update_backtick_count;
-
-    while let Some(c) = iter.next() {
-        update_backtick_count = true;
-
-        if c == '\\' && backticks > 0 {
-            let mut tail = String::new();
-            let mut escape_success = false;
-            let mut backticks_after_slash = 0u32;
-
-            while let Some(&s) = iter.peek() {
-                match s {
-                    '\\' => {
-                        if backticks_after_slash == 0 {
-                            tail.push('\\');
-                        } else {
-                            // Pattern like `\`\` should fail
-                            // escape and just be printed verbantim.
-                            break;
-                        }
-                    }
-                    '`' => {
-                        tail.push(s);
-                        backticks_after_slash += 1;
-                        if backticks_after_slash == 2 {
-                            escape_success = true;
-                            iter.next();
-                            break;
-                        }
-                    }
-                    _ => break,
-                }
-
-                iter.next();
-            }
-
-            if !escape_success {
-                text.push(c);
-                backticks = backticks_after_slash;
-                update_backtick_count = false;
-            } else {
-                backticks = 0;
-            }
-
-            text.push_str(&tail);
-        } else {
-            text.push(c);
-        }
-
-        if update_backtick_count {
-            if c == '`' {
-                backticks += 1;
-            } else {
-                backticks = 0;
-            }
-        }
+/// Trims raw text and splits it into lines.
+///
+/// Returns whether at least one newline was contained in `raw`.
+fn trim_and_split_raw(raw: &str) -> (Vec<String>, bool) {
+    // Trims at most one space at the start and at the end.
+    let raw = raw.strip_prefix(' ').unwrap_or(raw);
+    let raw = raw.strip_suffix(' ').unwrap_or(raw);
+
+    let mut lines = split_lines(raw);
+    let had_newline = lines.len() > 1;
+    let is_whitespace = |line: &String| line.chars().all(char::is_whitespace);
+
+    // Trims a sequence of whitespace followed by a newline at the start.
+    if lines.first().map(is_whitespace).unwrap_or(false) {
+        lines.remove(0);
     }
 
-    split_lines(&text)
-}
+    // Trims a newline followed by a sequence of whitespace at the end.
+    if lines.last().map(is_whitespace).unwrap_or(false) {
+        lines.pop();
+    }
 
-/// Converts a hexademical sequence (without braces or "\u") into a character.
-pub fn hex_to_char(sequence: &str) -> Option<char> {
-    u32::from_str_radix(sequence, 16).ok().and_then(std::char::from_u32)
+    (lines, had_newline)
 }
 
 /// Splits a string into a vector of lines (respecting Unicode & Windows line breaks).
@@ -175,12 +135,17 @@ pub fn split_lines(text: &str) -> Vec<String> {
     lines
 }
 
+/// Converts a hexadecimal sequence (without braces or "\u") into a character.
+pub fn hex_to_char(sequence: &str) -> Option<char> {
+    u32::from_str_radix(sequence, 16).ok().and_then(std::char::from_u32)
+}
+
 #[cfg(test)]
+#[rustfmt::skip]
 mod tests {
     use super::*;
 
     #[test]
-    #[rustfmt::skip]
     fn test_unescape_strings() {
         fn test(string: &str, expected: &str) {
             assert_eq!(unescape_string(string), expected.to_string());
@@ -201,43 +166,48 @@ mod tests {
     }
 
     #[test]
-    #[rustfmt::skip]
-    fn test_unescape_raws() {
+    fn test_split_after_lang_tag() {
+        fn test(raw: &str, lang: Option<&str>, inner: &str) {
+            let (found_lang, found_inner) = split_after_lang_tag(raw);
+            assert_eq!(found_lang.as_ref().map(|id| id.as_str()), lang);
+            assert_eq!(found_inner, inner);
+        }
+
+        test("typst it!",   Some("typst"), " it!");
+        test("typst\n it!", Some("typst"), "\n it!");
+        test("typst\n it!", Some("typst"), "\n it!");
+        test("abc`",        Some("abc"),   "`");
+        test(" hi",         None,          " hi");
+        test("`",           None,          "`");
+    }
+
+    #[test]
+    fn test_trim_raw() {
         fn test(raw: &str, expected: Vec<&str>) {
-            assert_eq!(unescape_raw(raw), expected);
+            assert_eq!(trim_and_split_raw(raw).0, expected);
         }
 
-        test("raw\\`",     vec!["raw`"]);
-        test("raw\\\\`",   vec!["raw\\`"]);
-        test("raw\ntext",  vec!["raw", "text"]);
-        test("a\r\nb",     vec!["a", "b"]);
-        test("a\n\nb",     vec!["a", "", "b"]);
-        test("a\r\x0Bb",   vec!["a", "", "b"]);
-        test("a\r\n\r\nb", vec!["a", "", "b"]);
-        test("raw\\a",     vec!["raw\\a"]);
-        test("raw\\",      vec!["raw\\"]);
+        test(" hi",          vec!["hi"]);
+        test("  hi",         vec![" hi"]);
+        test("\nhi",         vec!["hi"]);
+        test("    \n hi",    vec![" hi"]);
+        test("hi ",          vec!["hi"]);
+        test("hi  ",         vec!["hi "]);
+        test("hi\n",         vec!["hi"]);
+        test("hi \n   ",     vec!["hi "]);
+        test("  \n hi \n  ", vec![" hi "]);
     }
 
     #[test]
-    #[rustfmt::skip]
-    fn test_unescape_code() {
+    fn test_split_lines() {
         fn test(raw: &str, expected: Vec<&str>) {
-            assert_eq!(unescape_code(raw), expected);
+            assert_eq!(split_lines(raw), expected);
         }
 
-        test("code\\`",       vec!["code\\`"]);
-        test("code`\\``",     vec!["code```"]);
-        test("code`\\`a",     vec!["code`\\`a"]);
-        test("code``hi`\\``", vec!["code``hi```"]);
-        test("code`\\\\``",   vec!["code`\\``"]);
-        test("code`\\`\\`go", vec!["code`\\`\\`go"]);
-        test("code`\\`\\``",  vec!["code`\\```"]);
-        test("code\ntext",    vec!["code", "text"]);
-        test("a\r\nb",        vec!["a", "b"]);
-        test("a\n\nb",        vec!["a", "", "b"]);
-        test("a\r\x0Bb",      vec!["a", "", "b"]);
-        test("a\r\n\r\nb",    vec!["a", "", "b"]);
-        test("code\\a",       vec!["code\\a"]);
-        test("code\\",        vec!["code\\"]);
+        test("raw\ntext",  vec!["raw", "text"]);
+        test("a\r\nb",     vec!["a", "b"]);
+        test("a\n\nb",     vec!["a", "", "b"]);
+        test("a\r\x0Bb",   vec!["a", "", "b"]);
+        test("a\r\n\r\nb", vec!["a", "", "b"]);
     }
 }
diff --git a/src/parse/parser.rs b/src/parse/parser.rs
index bbd7ee1d..3446af83 100644
--- a/src/parse/parser.rs
+++ b/src/parse/parser.rs
@@ -99,35 +99,22 @@ impl Parser<'_> {
                 self.parse_heading().map(SyntaxNode::Heading)
             }
 
-            Token::Raw { raw, terminated } => {
+            Token::Raw { raw, backticks, terminated } => {
                 if !terminated {
-                    error!(@self.feedback, end, "expected backtick");
+                    error!(@self.feedback, end, "expected backtick(s)");
                 }
-                self.with_span(SyntaxNode::Raw(unescape_raw(raw)))
-            }
 
-            Token::Code { lang, raw, terminated } => {
-                if !terminated {
-                    error!(@self.feedback, end, "expected backticks");
-                }
-
-                let lang = lang.and_then(|lang| {
-                    if let Some(ident) = Ident::new(lang.v) {
-                        Some(ident.span_with(lang.span))
-                    } else {
-                        error!(@self.feedback, lang.span, "invalid identifier");
-                        None
+                let raw = if backticks > 1 {
+                    process_raw(raw)
+                } else {
+                    Raw {
+                        lang: None,
+                        lines: split_lines(raw),
+                        inline: true,
                     }
-                });
-
-                let mut lines = unescape_code(raw);
-                let block = lines.len() > 1;
-
-                if lines.last().map(|s| s.is_empty()).unwrap_or(false) {
-                    lines.pop();
-                }
+                };
 
-                self.with_span(SyntaxNode::Code(Code { lang, lines, block }))
+                self.with_span(SyntaxNode::Raw(raw))
             }
 
             Token::Text(text) => self.with_span(SyntaxNode::Text(text.to_string())),
diff --git a/src/parse/tests.rs b/src/parse/tests.rs
index d663aa2a..8ddf013d 100644
--- a/src/parse/tests.rs
+++ b/src/parse/tests.rs
@@ -29,24 +29,17 @@ macro_rules! H {
 }
 
 macro_rules! R {
-    ($($line:expr),* $(,)?) => {
-        SyntaxNode::Raw(vec![$($line.to_string()),*])
-    };
-}
-
-macro_rules! C {
-    ($lang:expr, $($line:expr),* $(,)?) => {{
-        let lines = vec![$($line.to_string()) ,*];
-        SyntaxNode::Code(Code {
+    ($lang:expr, $inline:expr, $($line:expr),* $(,)?) => {{
+        SyntaxNode::Raw(Raw {
             lang: $lang,
-            block: lines.len() > 1,
-            lines,
+            lines: vec![$($line.to_string()) ,*],
+            inline: $inline,
         })
     }};
 }
 
-fn Lang<'a, T: Into<Spanned<&'a str>>>(lang: T) -> Option<Spanned<Ident>> {
-    Some(Into::<Spanned<&str>>::into(lang).map(|s| Ident(s.to_string())))
+fn Lang(lang: &str) -> Option<Ident> {
+    Some(Ident(lang.to_string()))
 }
 
 macro_rules! F {
@@ -220,19 +213,7 @@ fn test_parse_simple_nodes() {
     t!("\\u{1f303}"     => T("🌃"));
     t!("\n\n\nhello"    => P, T("hello"));
     t!(r"a\ b"          => T("a"), L, S, T("b"));
-    t!("`py`"           => R!["py"]);
-    t!("`hi\nyou"       => R!["hi", "you"]);
-    e!("`hi\nyou"       => s(7, 7, "expected backtick"));
-    t!("`hi\\`du`"      => R!["hi`du"]);
-
-    ts!("```java out```" => s(0, 14, C![Lang(s(3, 7, "java")), "out"]));
-    t!("``` console.log(\n\"alert\"\n)" => C![None, "console.log(", "\"alert\"", ")"]);
-    t!("```typst \r\n Typst uses `\\`` to indicate code blocks" => C![
-        Lang("typst"), " Typst uses ``` to indicate code blocks"
-    ]);
 
-    e!("``` hi\nyou"      => s(10, 10, "expected backticks"));
-    e!("```🌍 hi\nyou```" => s(3, 7, "invalid identifier"));
     e!("\\u{d421c809}"    => s(0, 12, "invalid unicode escape sequence"));
     e!("\\u{abc"          => s(6, 6, "expected closing brace"));
     t!("💜\n\n 🌍"       => T("💜"), P, T("🌍"));
@@ -243,6 +224,33 @@ fn test_parse_simple_nodes() {
 }
 
 #[test]
+fn test_parse_raw() {
+    t!("`py`"            => R![None, true, "py"]);
+    t!("`hi\nyou"        => R![None, true, "hi", "you"]);
+    t!(r"`` hi\`du``"    => R![None, true, r"hi\`du"]);
+
+    // More than one backtick with optional language tag.
+    t!("``` console.log(\n\"alert\"\n)" => R![None, false, "console.log(", "\"alert\"", ")"]);
+    t!("````typst \r\n Typst uses ``` to indicate code blocks````!"
+        => R![Lang("typst"), false, " Typst uses ``` to indicate code blocks"], T("!"));
+
+    // Trimming of whitespace.
+    t!("`` a ``"         => R![None, true, "a"]);
+    t!("`` a  ``"        => R![None, true, "a "]);
+    t!("`` ` ``"         => R![None, true, "`"]);
+    t!("```  `   ```"    => R![None, true, " `  "]);
+    t!("```  `   \n ```" => R![None, false, " `   "]);
+
+    // Errors.
+    e!("`hi\nyou"         => s(7, 7, "expected backtick(s)"));
+    e!("``` hi\nyou"      => s(10, 10, "expected backtick(s)"));
+
+    // TODO: Bring back when spans/errors are in place.
+    // ts!("``java out``" => s(0, 12, R![Lang(s(2, 6, "java")), true, "out"]));
+    // e!("```🌍 hi\nyou```" => s(3, 7, "invalid identifier"));
+}
+
+#[test]
 fn test_parse_comments() {
     // In body.
     t!("hi// you\nw"          => T("hi"), S, T("w"));
@@ -348,7 +356,7 @@ fn test_parse_function_bodies() {
     e!(" [val][ */]"    => s(8, 10, "unexpected end of block comment"));
 
     // Raw in body.
-    t!("[val][`Hi]`" => F!("val"; Tree![R!["Hi]"]]));
+    t!("[val][`Hi]`" => F!("val"; Tree![R![None, true, "Hi]"]]));
     e!("[val][`Hi]`" => s(11, 11, "expected closing bracket"));
 
     // Crazy.
diff --git a/src/parse/tokenizer.rs b/src/parse/tokenizer.rs
index 92d15edc..720bec43 100644
--- a/src/parse/tokenizer.rs
+++ b/src/parse/tokenizer.rs
@@ -56,7 +56,7 @@ impl<'s> Tokens<'s> {
     /// The position in the string at which the last token ends and next token
     /// will start.
     pub fn pos(&self) -> Pos {
-        Pos(self.index as u32)
+        self.index.into()
     }
 }
 
@@ -111,7 +111,7 @@ impl<'s> Iterator for Tokens<'s> {
 
             // Style toggles.
             '_' if self.mode == Body => Underscore,
-            '`' if self.mode == Body => self.read_raw_or_code(),
+            '`' if self.mode == Body => self.read_raw(),
 
             // Sections.
             '#' if self.mode == Body => Hashtag,
@@ -230,66 +230,31 @@ impl<'s> Tokens<'s> {
         Str { string, terminated }
     }
 
-    fn read_raw_or_code(&mut self) -> Token<'s> {
-        let (raw, terminated) = self.read_until_unescaped('`');
-        if raw.is_empty() && terminated && self.peek() == Some('`') {
-            // Third tick found; this is a code block.
+    fn read_raw(&mut self) -> Token<'s> {
+        let mut backticks = 1;
+        while self.peek() == Some('`') {
             self.eat();
+            backticks += 1;
+        }
 
-            // Reads the lang tag (until newline or whitespace).
-            let start = self.pos();
-            let (lang, _) = self.read_string_until(false, 0, 0, |c| {
-                c == '`' || c.is_whitespace() || is_newline_char(c)
-            });
-            let end = self.pos();
-
-            let lang = if !lang.is_empty() {
-                Some(lang.span_with(Span::new(start, end)))
-            } else {
-                None
-            };
-
-            // Skip to start of raw contents.
-            while let Some(c) = self.peek() {
-                if is_newline_char(c) {
-                    self.eat();
-                    if c == '\r' && self.peek() == Some('\n') {
-                        self.eat();
-                    }
-
-                    break;
-                } else if c.is_whitespace() {
-                    self.eat();
-                } else {
-                    break;
-                }
-            }
-
-            let start = self.index;
-            let mut backticks = 0u32;
+        let start = self.index;
 
-            while backticks < 3 {
-                match self.eat() {
-                    Some('`') => backticks += 1,
-                    // Escaping of triple backticks.
-                    Some('\\') if backticks == 1 && self.peek() == Some('`') => {
-                        backticks = 0;
-                    }
-                    Some(_) => {}
-                    None => break,
-                }
+        let mut found = 0;
+        while found < backticks {
+            match self.eat() {
+                Some('`') => found += 1,
+                Some(_) => found = 0,
+                None => break,
             }
+        }
 
-            let terminated = backticks == 3;
-            let end = self.index - if terminated { 3 } else { 0 };
+        let terminated = found == backticks;
+        let end = self.index - if terminated { found } else { 0 };
 
-            Code {
-                lang,
-                raw: &self.src[start .. end],
-                terminated,
-            }
-        } else {
-            Raw { raw, terminated }
+        Raw {
+            raw: &self.src[start .. end],
+            backticks,
+            terminated,
         }
     }
 
@@ -469,18 +434,8 @@ mod tests {
     fn Str(string: &str, terminated: bool) -> Token {
         Token::Str { string, terminated }
     }
-    fn Raw(raw: &str, terminated: bool) -> Token {
-        Token::Raw { raw, terminated }
-    }
-    fn Code<'a>(
-        lang: Option<Spanned<&'a str>>,
-        raw: &'a str,
-        terminated: bool,
-    ) -> Token<'a> {
-        Token::Code { lang, raw, terminated }
-    }
-    fn Lang<'a, T: Into<Spanned<&'a str>>>(lang: T) -> Option<Spanned<&'a str>> {
-        Some(Into::<Spanned<&str>>::into(lang))
+    fn Raw(raw: &str, backticks: usize, terminated: bool) -> Token {
+        Token::Raw { raw, backticks, terminated }
     }
     fn UE(sequence: &str, terminated: bool) -> Token {
         Token::UnicodeEscape { sequence, terminated }
@@ -535,21 +490,33 @@ mod tests {
         t!(Body, "***"           => Star, Star, Star);
         t!(Body, "[func]*bold*"  => L, T("func"), R, Star, T("bold"), Star);
         t!(Body, "hi_you_ there" => T("hi"), Underscore, T("you"), Underscore, S(0), T("there"));
-        t!(Body, "`raw`"         => Raw("raw", true));
         t!(Body, "# hi"          => Hashtag, S(0), T("hi"));
         t!(Body, "#()"           => Hashtag, T("()"));
-        t!(Body, "`[func]`"      => Raw("[func]", true));
-        t!(Body, "`]"            => Raw("]", false));
-        t!(Body, "\\ "           => Backslash, S(0));
-        t!(Body, "`\\``"         => Raw("\\`", true));
-        t!(Body, "``not code`"   => Raw("", true), T("not"), S(0), T("code"), Raw("", false));
-        t!(Body, "```rust hi```" => Code(Lang("rust"), "hi", true));
-        t!(Body, "``` hi`\\``"   => Code(None, "hi`\\``", false));
-        t!(Body, "```js   \r\n  document.write(\"go\")" => Code(Lang("js"), "  document.write(\"go\")", false));
         t!(Header, "_`"          => Invalid("_`"));
     }
 
     #[test]
+    fn test_tokenize_raw() {
+        // Basics.
+        t!(Body, "`raw`"    => Raw("raw", 1, true));
+        t!(Body, "`[func]`" => Raw("[func]", 1, true));
+        t!(Body, "`]"       => Raw("]", 1, false));
+        t!(Body, r"`\`` "   => Raw(r"\", 1, true), Raw(" ", 1, false));
+
+        // Language tag.
+        t!(Body, "``` hi```"     => Raw(" hi", 3, true));
+        t!(Body, "```rust hi```" => Raw("rust hi", 3, true));
+        t!(Body, r"``` hi\````"  => Raw(r" hi\", 3, true), Raw("", 1, false));
+        t!(Body, "``` not `y`e`t finished```" => Raw(" not `y`e`t finished", 3, true));
+        t!(Body, "```js   \r\n  document.write(\"go\")`"
+            => Raw("js   \r\n  document.write(\"go\")`", 3, false));
+
+        // More backticks.
+        t!(Body, "`````` ``````hi"  => Raw(" ", 6, true), T("hi"));
+        t!(Body, "````\n```js\nalert()\n```\n````" => Raw("\n```js\nalert()\n```\n", 4, true));
+    }
+
+    #[test]
     fn tokenize_header_only_tokens() {
         t!(Body, "a: b"                => T("a:"), S(0), T("b"));
         t!(Body, "c=d, "               => T("c=d,"), S(0));
author	Laurenz <laurmaedje@gmail.com>	2020-09-30 18:59:33 +0200
committer	Laurenz <laurmaedje@gmail.com>	2020-09-30 18:59:33 +0200
commit	4077a7c11ea19b1b6b6b6fe3014b9018846cf21b (patch)
tree	70e4c891c2c660b4136890cebbae7c375fe36c05 /src/parse
parent	7cc279f7ae122f4c40592004dde89792c636b3c8 (diff)