Merge pull request #12 from typst/unicode-escape

Unicode escape sequences in strings and body text
author: Laurenz <laurmaedje@gmail.com> 2020-08-30 15:22:56 +0200
committer: GitHub <noreply@github.com> 2020-08-30 15:22:56 +0200
commit: 07f387d08825cd3de4ea0965b2564ce8f0c4a166 (patch)
tree: 3a5f06e3bd713ce2a82a7a166b6256fac103c33e /src
parent: f7f255d5ea4262e545fd33dbb910d683e5d738ff (diff)
parent: 9861a9583eb52f5d66a7a350e5c0bd23985b0414 (diff)
2 files changed, 110 insertions, 18 deletions
diff --git a/src/syntax/parsing.rs b/src/syntax/parsing.rs
index 0d12f6e1..e35835c8 100644
--- a/src/syntax/parsing.rs
+++ b/src/syntax/parsing.rs
@@ -110,6 +110,26 @@ impl Parser<'_> {
                     self.with_span(SyntaxNode::Text(text.to_string()))
                 }
 
+                Token::UnicodeEscape { sequence, terminated } => {
+                    if !terminated {
+                        error!(
+                            @self.feedback, Span::at(token.span.end),
+                            "expected closing brace",
+                        );
+                    }
+
+                    if let Some(c) = unescape_char(sequence) {
+                        self.with_span(SyntaxNode::Text(c.to_string()))
+                    } else {
+                        self.eat();
+                        error!(
+                            @self.feedback, token.span,
+                            "invalid unicode escape sequence",
+                        );
+                        continue;
+                    }
+                }
+
                 unexpected => {
                     self.eat();
                     error!(
@@ -594,7 +614,7 @@ impl Group {
 }
 
 fn unescape_string(string: &str) -> String {
-    let mut iter = string.chars();
+    let mut iter = string.chars().peekable();
     let mut out = String::with_capacity(string.len());
 
     while let Some(c) = iter.next() {
@@ -602,6 +622,36 @@ fn unescape_string(string: &str) -> String {
             match iter.next() {
                 Some('\\') => out.push('\\'),
                 Some('"') => out.push('"'),
+                Some('u') if iter.peek() == Some(&'{') => {
+                    iter.next();
+
+                    let mut sequence = String::new();
+                    let terminated = loop {
+                        match iter.peek() {
+                            // TODO: Feedback that closing brace is missing.
+                            Some('}') => {
+                                iter.next();
+                                break true;
+                            }
+                            Some(&c) if c.is_ascii_hexdigit() => {
+                                iter.next();
+                                sequence.push(c);
+                            }
+                            _ => break false,
+                        }
+                    };
+
+                    // TODO: Feedback that escape sequence is wrong.
+                    if let Some(c) = unescape_char(&sequence) {
+                        out.push(c);
+                    } else {
+                        out.push_str("\\u{");
+                        out.push_str(&sequence);
+                        if terminated {
+                            out.push('}');
+                        }
+                    }
+                }
                 Some('n') => out.push('\n'),
                 Some('t') => out.push('\t'),
                 Some(c) => { out.push('\\'); out.push(c); }
@@ -617,7 +667,7 @@ fn unescape_string(string: &str) -> String {
 
 /// Unescape raw markup and split it into into lines.
 fn unescape_raw(raw: &str) -> Vec<String> {
-    let mut iter = raw.chars().peekable();
+    let mut iter = raw.chars();
     let mut text = String::new();
 
     while let Some(c) = iter.next() {
@@ -705,6 +755,11 @@ fn unescape_code(raw: &str) -> Vec<String> {
     split_lines(&text)
 }
 
+/// Converts a hexadecimal sequence (without braces or "\u") into a character.
+fn unescape_char(sequence: &str) -> Option<char> {
+    u32::from_str_radix(sequence, 16).ok().and_then(std::char::from_u32)
+}
+
 fn split_lines(text: &str) -> Vec<String> {
     let mut iter = text.chars().peekable();
     let mut line = String::new();
@@ -890,6 +945,9 @@ mod tests {
         test(r#"hello world"#,  "hello world");
         test(r#"hello\nworld"#, "hello\nworld");
         test(r#"a\"bc"#,        "a\"bc");
+        test(r#"a\u{2603}bc"#,  "a☃bc");
+        test(r#"a\u{26c3bg"#,   "a𦰻g");
+        test(r#"av\u{6797"#,    "av林");
         test(r#"a\\"#,          "a\\");
         test(r#"a\\\nbc"#,      "a\\\nbc");
         test(r#"a\tbc"#,        "a\tbc");
@@ -944,6 +1002,7 @@ mod tests {
         t!("*hi"         => B, T("hi"));
         t!("hi_"         => T("hi"), I);
         t!("hi you"      => T("hi"), S, T("you"));
+        t!("\\u{1f303}"  => T("🌃"));
         t!("\n\n\nhello" => P, T("hello"));
         t!(r"a\ b"       => T("a"), L, S, T("b"));
         t!("`py`"        => R!["py"]);
@@ -951,17 +1010,16 @@ mod tests {
         e!("`hi\nyou"    => s(1,3, 1,3, "expected backtick"));
         t!("`hi\\`du`"   => R!["hi`du"]);
 
-        t!("```java System.out.print```" => C![
-            Some("java"), "System.out.print"
-        ]);
-        t!("``` console.log(\n\"alert\"\n)" => C![
-            None, "console.log(", "\"alert\"", ")"
-        ]);
+        t!("```java System.out.print```" => C![Some("java"), "System.out.print"]);
+        t!("``` console.log(\n\"alert\"\n)" => C![None, "console.log(", "\"alert\"", ")"]);
         t!("```typst \r\n Typst uses `\\`` to indicate code blocks" => C![
             Some("typst"), " Typst uses ``` to indicate code blocks"
         ]);
-        e!("``` hi\nyou"      => s(1,3, 1,3, "expected backticks"));
-        e!("```🌍 hi\nyou```" => s(0,3, 0,4, "invalid identifier"));
+
+        e!("``` hi\nyou"      => s(1,3, 1,3,  "expected backticks"));
+        e!("```🌍 hi\nyou```" => s(0,3, 0,4,  "invalid identifier"));
+        e!("\\u{d421c809}"    => s(0,0, 0,12, "invalid unicode escape sequence"));
+        e!("\\u{abc"          => s(0,6, 0,6, "expected closing brace"));
         t!("💜\n\n 🌍"       => T("💜"), P, T("🌍"));
 
         ts!("hi"   => s(0,0, 0,2, T("hi")));
diff --git a/src/syntax/tokens.rs b/src/syntax/tokens.rs
index 7ecb05fe..fe20d11a 100644
--- a/src/syntax/tokens.rs
+++ b/src/syntax/tokens.rs
@@ -82,6 +82,14 @@ pub enum Token<'s> {
     /// A backslash followed by whitespace in text.
     Backslash,
 
+    /// A unicode escape sequence.
+    UnicodeEscape {
+        /// The escape sequence between two braces.
+        sequence: &'s str,
+        /// Whether the closing brace was present.
+        terminated: bool,
+    },
+
     /// Raw text.
     Raw {
         /// The raw text (not yet unescaped as for strings).
@@ -136,6 +144,7 @@ impl<'s> Token<'s> {
             Star => "star",
             Underscore => "underscore",
             Backslash => "backslash",
+            UnicodeEscape { .. } => "unicode escape sequence",
             Raw { .. } => "raw text",
             Code { .. } => "code block",
             Text(_) => "text",
@@ -426,6 +435,25 @@ impl<'s> Tokens<'s> {
         }
 
         match self.peek() {
+            Some('u') => {
+                self.eat();
+                if self.peek() == Some('{') {
+                    self.eat();
+                    let sequence = self.read_string_until(
+                        |c| !c.is_ascii_hexdigit(),
+                        false, 0, 0,
+                    ).0;
+
+                    let terminated = self.peek() == Some('}');
+                    if terminated {
+                        self.eat();
+                    }
+
+                    UnicodeEscape { sequence, terminated }
+                } else {
+                    Text("\\u")
+                }
+            }
             Some(c) if is_escapable(c) => {
                 let index = self.index();
                 self.eat();
@@ -588,6 +616,7 @@ mod tests {
     fn Code<'a>(lang: Option<&'a str>, raw: &'a str, terminated: bool) -> Token<'a> {
         Token::Code { lang: lang.map(Spanned::zero), raw, terminated }
     }
+    fn UE(sequence: &str, terminated: bool) -> Token { Token::UnicodeEscape { sequence, terminated } }
 
     macro_rules! t { ($($tts:tt)*) => {test!(@spans=false, $($tts)*)} }
     macro_rules! ts { ($($tts:tt)*) => {test!(@spans=true, $($tts)*)} }
@@ -701,14 +730,16 @@ mod tests {
 
     #[test]
     fn tokenize_escaped_symbols() {
-        t!(Body, r"\\"   => T(r"\"));
-        t!(Body, r"\["   => T("["));
-        t!(Body, r"\]"   => T("]"));
-        t!(Body, r"\*"   => T("*"));
-        t!(Body, r"\_"   => T("_"));
-        t!(Body, r"\`"   => T("`"));
-        t!(Body, r"\/"   => T("/"));
-        t!(Body, r#"\""# => T("\""));
+        t!(Body, r"\\"       => T(r"\"));
+        t!(Body, r"\["       => T("["));
+        t!(Body, r"\]"       => T("]"));
+        t!(Body, r"\*"       => T("*"));
+        t!(Body, r"\_"       => T("_"));
+        t!(Body, r"\`"       => T("`"));
+        t!(Body, r"\/"       => T("/"));
+        t!(Body, r"\u{2603}" => UE("2603", true));
+        t!(Body, r"\u{26A4"  => UE("26A4", false));
+        t!(Body, r#"\""#     => T("\""));
     }
 
     #[test]
@@ -716,6 +747,9 @@ mod tests {
         t!(Body, r"\a"     => T("\\"), T("a"));
         t!(Body, r"\:"     => T(r"\"), T(":"));
         t!(Body, r"\="     => T(r"\"), T("="));
+        t!(Body, r"\u{2GA4"=> UE("2", false), T("GA4"));
+        t!(Body, r"\u{ "   => UE("", false), Space(0));
+        t!(Body, r"\u"     => T(r"\u"));
         t!(Header, r"\\\\" => Invalid(r"\\\\"));
         t!(Header, r"\a"   => Invalid(r"\a"));
         t!(Header, r"\:"   => Invalid(r"\"), Colon);
author	Laurenz <laurmaedje@gmail.com>	2020-08-30 15:22:56 +0200
committer	GitHub <noreply@github.com>	2020-08-30 15:22:56 +0200
commit	07f387d08825cd3de4ea0965b2564ce8f0c4a166 (patch)
tree	3a5f06e3bd713ce2a82a7a166b6256fac103c33e /src
parent	f7f255d5ea4262e545fd33dbb910d683e5d738ff (diff)
parent	9861a9583eb52f5d66a7a350e5c0bd23985b0414 (diff)