Merge some modules

author: Laurenz <laurmaedje@gmail.com> 2022-10-17 19:26:24 +0200
committer: Laurenz <laurmaedje@gmail.com> 2022-10-17 20:04:22 +0200
commit: e21822665591dc19766275da1e185215a6b945ef (patch)
tree: 7788e211c3c33c8b5a8ad7d5eb7574e33631eb16 /src/syntax/resolve.rs
parent: 4fd031a256b2ecfe524859d5599fafb386395572 (diff)
1 files changed, 237 insertions, 0 deletions
diff --git a/src/syntax/resolve.rs b/src/syntax/resolve.rs
new file mode 100644
index 00000000..2ad35cec
--- /dev/null
+++ b/src/syntax/resolve.rs
@@ -0,0 +1,237 @@
+use unscanny::Scanner;
+
+use super::{is_ident, is_newline, RawKind};
+use crate::util::EcoString;
+
+/// Replace every escape sequence in `string` with the character it denotes.
+///
+/// Recognized escapes are `\\`, `\"`, `\n`, `\r`, `\t` and `\u{...}` with
+/// hexadecimal digits between the braces (the closing brace is consumed when
+/// present but not yet required). Any other backslash sequence, a trailing
+/// backslash, and a `\u{...}` whose digits `resolve_hex` rejects are copied
+/// to the output unchanged.
+pub fn resolve_string(string: &str) -> EcoString {
+    let mut scanner = Scanner::new(string);
+    let mut resolved = EcoString::with_capacity(string.len());
+
+    while let Some(c) = scanner.eat() {
+        if c != '\\' {
+            resolved.push(c);
+            continue;
+        }
+
+        // Remember where the backslash sits so that an unresolvable escape
+        // can be emitted verbatim.
+        let backslash = scanner.locate(-1);
+        let replacement = match scanner.eat() {
+            Some('\\') => Some('\\'),
+            Some('"') => Some('"'),
+            Some('n') => Some('\n'),
+            Some('r') => Some('\r'),
+            Some('t') => Some('\t'),
+            Some('u') if scanner.eat_if('{') => {
+                // TODO: Error if closing brace is missing.
+                let digits = scanner.eat_while(char::is_ascii_hexdigit);
+                let _terminated = scanner.eat_if('}');
+                resolve_hex(digits)
+            }
+            _ => None,
+        };
+
+        match replacement {
+            Some(ch) => resolved.push(ch),
+            None => resolved.push_str(scanner.from(backslash)),
+        }
+    }
+
+    resolved
+}
+
+/// Resolve a hexadecimal escape sequence into a character
+/// (only the inner hex digits without braces or `\u`).
+///
+/// Returns `None` when `sequence` is empty, contains anything other than
+/// ASCII hex digits (including a leading `+`, which `u32::from_str_radix`
+/// on its own would accept), does not fit into a `u32`, or does not denote a
+/// valid Unicode scalar value.
+pub fn resolve_hex(sequence: &str) -> Option<char> {
+    // `from_str_radix` tolerates a leading sign, so validate the digits first.
+    if !sequence.bytes().all(|b| b.is_ascii_hexdigit()) {
+        return None;
+    }
+
+    u32::from_str_radix(sequence, 16).ok().and_then(std::char::from_u32)
+}
+
+/// Build the `RawKind` for raw text delimited by `backticks` backticks.
+///
+/// With a single backtick there is no language tag: only the line breaks of
+/// `text` are normalized to `\n` and the result is never a block. With more
+/// backticks, the leading language tag is split off (and kept only if it is
+/// an identifier), and the remaining text is trimmed and dedented relative
+/// to `column`.
+pub fn resolve_raw(column: usize, backticks: usize, text: &str) -> RawKind {
+    if backticks <= 1 {
+        let normalized = split_lines(text).join("\n");
+        return RawKind {
+            lang: None,
+            text: normalized.into(),
+            block: false,
+        };
+    }
+
+    let (tag, rest) = split_at_lang_tag(text);
+    let (trimmed, block) = trim_and_split_raw(column, rest);
+    RawKind {
+        lang: is_ident(tag).then(|| tag.into()),
+        text: trimmed.into(),
+        block,
+    }
+}
+
+/// Split `raw` into its leading language tag and the text that follows.
+///
+/// The tag ends at the first backtick, whitespace or newline character; that
+/// character stays part of the returned remainder. The tag is empty when
+/// `raw` starts with such a character.
+fn split_at_lang_tag(raw: &str) -> (&str, &str) {
+    let mut scanner = Scanner::new(raw);
+    let ends_tag = |c: char| c == '`' || c.is_whitespace() || is_newline(c);
+    let tag = scanner.eat_until(ends_tag);
+    let rest = scanner.after();
+    (tag, rest)
+}
+
+/// Trim and dedent raw text, returning it with its lines joined by `\n`.
+///
+/// One leading space is removed, as is one trailing space when the last
+/// non-whitespace character is a backtick. Every line but the first loses up
+/// to `column` leading whitespace characters. A whitespace-only first line
+/// and a whitespace-only last line are dropped.
+///
+/// The returned flag tells whether `raw` contained at least one newline.
+fn trim_and_split_raw(column: usize, mut raw: &str) -> (String, bool) {
+    // Drop a single space directly after the opening delimiter.
+    if let Some(rest) = raw.strip_prefix(' ') {
+        raw = rest;
+    }
+
+    // Drop a single trailing space, but only when the text ends in a
+    // backtick (ignoring trailing whitespace).
+    if raw.trim_end().ends_with('`') {
+        if let Some(rest) = raw.strip_suffix(' ') {
+            raw = rest;
+        }
+    }
+
+    let mut lines = split_lines(raw);
+
+    // Decided before blank lines are dropped below.
+    let had_newline = lines.len() > 1;
+
+    // Dedent by at most `column` whitespace characters; the first line is
+    // left untouched.
+    for line in lines.iter_mut().skip(1) {
+        let indent = line
+            .chars()
+            .take(column)
+            .take_while(|c| c.is_whitespace())
+            .map(char::len_utf8)
+            .sum::<usize>();
+        *line = &line[indent ..];
+    }
+
+    let blank = |line: &&str| line.chars().all(char::is_whitespace);
+
+    // A whitespace-only first line goes away together with its newline.
+    if lines.first().map_or(false, blank) {
+        lines.remove(0);
+    }
+
+    // Likewise for a whitespace-only last line.
+    if lines.last().map_or(false, blank) {
+        lines.pop();
+    }
+
+    let joined = lines.join("\n");
+    (joined, had_newline)
+}
+
+/// Split `text` at every line break recognized by `is_newline`.
+///
+/// A `\r\n` pair counts as a single break. The line break characters are not
+/// part of the returned slices, and the result always holds at least one
+/// (possibly empty) line.
+fn split_lines(text: &str) -> Vec<&str> {
+    let mut scanner = Scanner::new(text);
+    let mut lines = Vec::new();
+
+    loop {
+        // Everything up to the next line break (or the end) is one line.
+        lines.push(scanner.eat_until(|c: char| is_newline(c)));
+
+        match scanner.eat() {
+            // Treat `\r\n` as one break by swallowing the `\n` as well.
+            Some('\r') => {
+                scanner.eat_if('\n');
+            }
+            Some(_) => {}
+            None => break,
+        }
+    }
+
+    lines
+}
+
+// Unit tests for escape resolution and raw text handling. Formatting is
+// skipped so that the test tables below stay column-aligned.
+#[cfg(test)]
+#[rustfmt::skip]
+mod tests {
+    use super::*;
+
+    /// Escapes in strings are resolved; unknown or dangling ones are kept.
+    #[test]
+    fn test_resolve_strings() {
+        #[track_caller]
+        fn test(string: &str, expected: &str) {
+            assert_eq!(resolve_string(string), expected);
+        }
+
+        test(r#"hello world"#,  "hello world");
+        test(r#"hello\nworld"#, "hello\nworld");
+        test(r#"a\"bc"#,        "a\"bc");
+        test(r#"a\u{2603}bc"#,  "a☃bc");
+        test(r#"a\u{26c3bg"#,   "a𦰻g");
+        test(r#"av\u{6797"#,    "av林");
+        test(r#"a\\"#,          "a\\");
+        test(r#"a\\\nbc"#,      "a\\\nbc");
+        test(r#"a\t\r\nbc"#,    "a\t\r\nbc");
+        test(r"🌎",             "🌎");
+        test(r"🌎\",            r"🌎\");
+        test(r"\🌎",            r"\🌎");
+    }
+
+    /// The language tag ends at the first backtick or whitespace character.
+    #[test]
+    fn test_split_at_lang_tag() {
+        #[track_caller]
+        fn test(text: &str, lang: &str, inner: &str) {
+            assert_eq!(split_at_lang_tag(text), (lang, inner));
+        }
+
+        test("typst it!",     "typst", " it!");
+        test("typst\n it!",   "typst", "\n it!");
+        // Windows line break instead of a second, identical `\n` case.
+        test("typst\r\n it!", "typst", "\r\n it!");
+        test("abc`",          "abc",   "`");
+        test(" hi",           "",      " hi");
+        test("`",             "",      "`");
+    }
+
+    /// Language tag, trimmed text and block flag for various raw inputs.
+    #[test]
+    fn test_resolve_raw() {
+        #[track_caller]
+        fn test(
+            column: usize,
+            backticks: usize,
+            raw: &str,
+            lang: Option<&str>,
+            text: &str,
+            block: bool,
+        ) {
+            let node = resolve_raw(column, backticks, raw);
+            assert_eq!(node.lang.as_deref(), lang);
+            assert_eq!(node.text, text);
+            assert_eq!(node.block, block);
+        }
+
+        // Just one backtick.
+        test(0, 1, "py",     None, "py",   false);
+        test(0, 1, "1\n2",   None, "1\n2", false);
+        test(0, 1, "1\r\n2", None, "1\n2", false);
+
+        // More than one backtick with lang tag.
+        test(0, 2, "js alert()",    Some("js"), "alert()",    false);
+        test(0, 3, "py quit(\n\n)", Some("py"), "quit(\n\n)", true);
+        test(0, 2, "♥",             None,       "",           false);
+
+        // Trimming of whitespace (tested more thoroughly in separate test).
+        test(0, 2, " a",   None, "a",  false);
+        test(0, 2, "  a",  None, " a", false);
+        test(0, 2, " \na", None, "a",  true);
+
+        // Dedenting
+        test(2, 3, " def foo():\n    bar()", None, "def foo():\n  bar()", true);
+    }
+
+    /// Trimming of single spaces and of blank first and last lines.
+    #[test]
+    fn test_trim_raw() {
+        #[track_caller]
+        fn test(text: &str, expected: &str) {
+            assert_eq!(trim_and_split_raw(0, text).0, expected);
+        }
+
+        test(" hi",          "hi");
+        test("  hi",         " hi");
+        test("\nhi",         "hi");
+        test("    \n hi",    " hi");
+        test("hi` ",         "hi`");
+        test("hi`  ",        "hi` ");
+        test("hi`   ",       "hi`  ");
+        test("hi ",          "hi ");
+        test("hi  ",         "hi  ");
+        test("hi\n",         "hi");
+        test("hi \n   ",     "hi ");
+        test("  \n hi \n  ", " hi ");
+    }
+}
author	Laurenz <laurmaedje@gmail.com>	2022-10-17 19:26:24 +0200
committer	Laurenz <laurmaedje@gmail.com>	2022-10-17 20:04:22 +0200
commit	e21822665591dc19766275da1e185215a6b945ef (patch)
tree	7788e211c3c33c8b5a8ad7d5eb7574e33631eb16 /src/syntax/resolve.rs
parent	4fd031a256b2ecfe524859d5599fafb386395572 (diff)