Refactor raw blocks 💱

author: Laurenz <laurmaedje@gmail.com> 2020-09-30 18:59:33 +0200
committer: Laurenz <laurmaedje@gmail.com> 2020-09-30 18:59:33 +0200
commit: 4077a7c11ea19b1b6b6b6fe3014b9018846cf21b (patch)
tree: 70e4c891c2c660b4136890cebbae7c375fe36c05 /src/parse/escaping.rs
parent: 7cc279f7ae122f4c40592004dde89792c636b3c8 (diff)
1 files changed, 84 insertions, 114 deletions
diff --git a/src/parse/escaping.rs b/src/parse/escaping.rs
index 55b1fe67..a2ff963b 100644
--- a/src/parse/escaping.rs
+++ b/src/parse/escaping.rs
@@ -1,4 +1,5 @@
 use super::is_newline_char;
+use crate::syntax::{Ident, Raw};
 
 /// Resolves all escape sequences in a string.
 pub fn unescape_string(string: &str) -> String {
@@ -56,101 +57,60 @@ pub fn unescape_string(string: &str) -> String {
     out
 }
 
-/// Resolves all escape sequences in raw markup (between backticks) and splits it into
-/// into lines.
-pub fn unescape_raw(raw: &str) -> Vec<String> {
+/// Resolves the language tag and trims the raw text.
+///
+/// Returns a `Raw` consisting of:
+/// - The language tag, if one was found
+/// - The trimmed raw lines
+/// - `inline`, which is true if the text after the tag contained no newline.
+pub fn process_raw(raw: &str) -> Raw {
+    let (lang, inner) = split_after_lang_tag(raw);
+    let (lines, had_newline) = trim_and_split_raw(inner);
+    Raw { lang, lines, inline: !had_newline }
+}
+
+/// Parse the lang tag and return it alongside the remaining inner raw text.
+fn split_after_lang_tag(raw: &str) -> (Option<Ident>, &str) {
+    let mut lang = String::new();
+
+    let mut inner = raw;
     let mut iter = raw.chars();
-    let mut text = String::new();
 
     while let Some(c) = iter.next() {
-        if c == '\\' {
-            if let Some(c) = iter.next() {
-                if c != '\\' && c != '`' {
-                    text.push('\\');
-                }
-
-                text.push(c);
-            } else {
-                text.push('\\');
-            }
-        } else {
-            text.push(c);
+        if c == '`' || c.is_whitespace() || is_newline_char(c) {
+            break;
         }
+
+        inner = iter.as_str();
+        lang.push(c);
     }
 
-    split_lines(&text)
+    (Ident::new(lang), inner)
 }
 
-/// Resolves all escape sequences in code markup (between triple backticks) and splits it
-/// into into lines.
-pub fn unescape_code(raw: &str) -> Vec<String> {
-    let mut iter = raw.chars().peekable();
-    let mut text = String::new();
-    let mut backticks = 0u32;
-    let mut update_backtick_count;
-
-    while let Some(c) = iter.next() {
-        update_backtick_count = true;
-
-        if c == '\\' && backticks > 0 {
-            let mut tail = String::new();
-            let mut escape_success = false;
-            let mut backticks_after_slash = 0u32;
-
-            while let Some(&s) = iter.peek() {
-                match s {
-                    '\\' => {
-                        if backticks_after_slash == 0 {
-                            tail.push('\\');
-                        } else {
-                            // Pattern like `\`\` should fail
-                            // escape and just be printed verbantim.
-                            break;
-                        }
-                    }
-                    '`' => {
-                        tail.push(s);
-                        backticks_after_slash += 1;
-                        if backticks_after_slash == 2 {
-                            escape_success = true;
-                            iter.next();
-                            break;
-                        }
-                    }
-                    _ => break,
-                }
-
-                iter.next();
-            }
-
-            if !escape_success {
-                text.push(c);
-                backticks = backticks_after_slash;
-                update_backtick_count = false;
-            } else {
-                backticks = 0;
-            }
-
-            text.push_str(&tail);
-        } else {
-            text.push(c);
-        }
-
-        if update_backtick_count {
-            if c == '`' {
-                backticks += 1;
-            } else {
-                backticks = 0;
-            }
-        }
+/// Trims raw text and splits it into lines.
+///
+/// Returns whether at least one newline was contained in `raw`.
+fn trim_and_split_raw(raw: &str) -> (Vec<String>, bool) {
+    // Trims at most one space character at the start and one at the end.
+    let raw = raw.strip_prefix(' ').unwrap_or(raw);
+    let raw = raw.strip_suffix(' ').unwrap_or(raw);
+
+    let mut lines = split_lines(raw);
+    let had_newline = lines.len() > 1;
+    let is_whitespace = |line: &String| line.chars().all(char::is_whitespace);
+
+    // Trims a sequence of whitespace followed by a newline at the start.
+    if lines.first().map(is_whitespace).unwrap_or(false) {
+        lines.remove(0);
     }
 
-    split_lines(&text)
-}
+    // Trims a newline followed by a sequence of whitespace at the end.
+    if lines.last().map(is_whitespace).unwrap_or(false) {
+        lines.pop();
+    }
 
-/// Converts a hexademical sequence (without braces or "\u") into a character.
-pub fn hex_to_char(sequence: &str) -> Option<char> {
-    u32::from_str_radix(sequence, 16).ok().and_then(std::char::from_u32)
+    (lines, had_newline)
 }
 
 /// Splits a string into a vector of lines (respecting Unicode & Windows line breaks).
@@ -175,12 +135,17 @@ pub fn split_lines(text: &str) -> Vec<String> {
     lines
 }
 
+/// Converts a hexadecimal sequence (without braces or "\u") into a character.
+pub fn hex_to_char(sequence: &str) -> Option<char> {
+    u32::from_str_radix(sequence, 16).ok().and_then(std::char::from_u32)
+}
+
 #[cfg(test)]
+#[rustfmt::skip]
 mod tests {
     use super::*;
 
     #[test]
-    #[rustfmt::skip]
     fn test_unescape_strings() {
         fn test(string: &str, expected: &str) {
             assert_eq!(unescape_string(string), expected.to_string());
@@ -201,43 +166,48 @@ mod tests {
     }
 
     #[test]
-    #[rustfmt::skip]
-    fn test_unescape_raws() {
+    fn test_split_after_lang_tag() {
+        fn test(raw: &str, lang: Option<&str>, inner: &str) {
+            let (found_lang, found_inner) = split_after_lang_tag(raw);
+            assert_eq!(found_lang.as_ref().map(|id| id.as_str()), lang);
+            assert_eq!(found_inner, inner);
+        }
+
+        test("typst it!",   Some("typst"), " it!");
+        test("typst\n it!", Some("typst"), "\n it!");
+        test("typst\n it!", Some("typst"), "\n it!");
+        test("abc`",        Some("abc"),   "`");
+        test(" hi",         None,          " hi");
+        test("`",           None,          "`");
+    }
+
+    #[test]
+    fn test_trim_raw() {
         fn test(raw: &str, expected: Vec<&str>) {
-            assert_eq!(unescape_raw(raw), expected);
+            assert_eq!(trim_and_split_raw(raw).0, expected);
         }
 
-        test("raw\\`",     vec!["raw`"]);
-        test("raw\\\\`",   vec!["raw\\`"]);
-        test("raw\ntext",  vec!["raw", "text"]);
-        test("a\r\nb",     vec!["a", "b"]);
-        test("a\n\nb",     vec!["a", "", "b"]);
-        test("a\r\x0Bb",   vec!["a", "", "b"]);
-        test("a\r\n\r\nb", vec!["a", "", "b"]);
-        test("raw\\a",     vec!["raw\\a"]);
-        test("raw\\",      vec!["raw\\"]);
+        test(" hi",          vec!["hi"]);
+        test("  hi",         vec![" hi"]);
+        test("\nhi",         vec!["hi"]);
+        test("    \n hi",    vec![" hi"]);
+        test("hi ",          vec!["hi"]);
+        test("hi  ",         vec!["hi "]);
+        test("hi\n",         vec!["hi"]);
+        test("hi \n   ",     vec!["hi "]);
+        test("  \n hi \n  ", vec![" hi "]);
     }
 
     #[test]
-    #[rustfmt::skip]
-    fn test_unescape_code() {
+    fn test_split_lines() {
         fn test(raw: &str, expected: Vec<&str>) {
-            assert_eq!(unescape_code(raw), expected);
+            assert_eq!(split_lines(raw), expected);
         }
 
-        test("code\\`",       vec!["code\\`"]);
-        test("code`\\``",     vec!["code```"]);
-        test("code`\\`a",     vec!["code`\\`a"]);
-        test("code``hi`\\``", vec!["code``hi```"]);
-        test("code`\\\\``",   vec!["code`\\``"]);
-        test("code`\\`\\`go", vec!["code`\\`\\`go"]);
-        test("code`\\`\\``",  vec!["code`\\```"]);
-        test("code\ntext",    vec!["code", "text"]);
-        test("a\r\nb",        vec!["a", "b"]);
-        test("a\n\nb",        vec!["a", "", "b"]);
-        test("a\r\x0Bb",      vec!["a", "", "b"]);
-        test("a\r\n\r\nb",    vec!["a", "", "b"]);
-        test("code\\a",       vec!["code\\a"]);
-        test("code\\",        vec!["code\\"]);
+        test("raw\ntext",  vec!["raw", "text"]);
+        test("a\r\nb",     vec!["a", "b"]);
+        test("a\n\nb",     vec!["a", "", "b"]);
+        test("a\r\x0Bb",   vec!["a", "", "b"]);
+        test("a\r\n\r\nb", vec!["a", "", "b"]);
     }
 }
author	Laurenz <laurmaedje@gmail.com>	2020-09-30 18:59:33 +0200
committer	Laurenz <laurmaedje@gmail.com>	2020-09-30 18:59:33 +0200
commit	4077a7c11ea19b1b6b6b6fe3014b9018846cf21b (patch)
tree	70e4c891c2c660b4136890cebbae7c375fe36c05 /src/parse/escaping.rs
parent	7cc279f7ae122f4c40592004dde89792c636b3c8 (diff)