Added Unicode Escaping for body text 👙

author: Martin Haug <mhaug@live.de> 2020-08-30 13:21:07 +0200
committer: Martin Haug <mhaug@live.de> 2020-08-30 13:21:07 +0200
commit: 7041e0938dd17e9a1777857459df4e8ad29b6c4a (patch)
tree: 431d3c983e70bf0a0f9fbb36796a7197f8e64861 /src/syntax/tokens.rs
parent: 2a6cde72726c057e2166fb4277b8fe53c398b3f9 (diff)
1 files changed, 53 insertions, 8 deletions
diff --git a/src/syntax/tokens.rs b/src/syntax/tokens.rs
index 7ecb05fe..f00c1b66 100644
--- a/src/syntax/tokens.rs
+++ b/src/syntax/tokens.rs
@@ -82,6 +82,9 @@ pub enum Token<'s> {
     /// A backslash followed by whitespace in text.
     Backslash,
 
+    /// A unicode escape sequence in text; holds the hex digits between the braces.
+    UnicodeEscape(&'s str),
+
     /// Raw text.
     Raw {
         /// The raw text (not yet unescaped as for strings).
@@ -136,6 +139,7 @@ impl<'s> Token<'s> {
             Star => "star",
             Underscore => "underscore",
             Backslash => "backslash",
+            UnicodeEscape(_) => "unicode escape sequence",
             Raw { .. } => "raw text",
             Code { .. } => "code block",
             Text(_) => "text",
@@ -426,6 +430,41 @@ impl<'s> Tokens<'s> {
         }
 
         match self.peek() {
+            Some(c) if c == 'u' => {
+                // Index which points to start of escape sequence
+                let index = self.index() - 1;
+                self.eat();
+
+                if self.peek() == Some('{') {
+                    self.eat();
+                    // This loop will eat all hexadecimal chars and an
+                    // optional closing brace (brace not in end index range).
+                    let mut end = self.index();
+                    let mut valid = true;
+                    while let Some(c) = self.peek() {
+                        if c == '}' {
+                            self.eat();
+                            break;
+                        }
+
+                        if !c.is_ascii_hexdigit() {
+                            valid = false;
+                            break;
+                        }
+
+                        self.eat();
+                        end = self.index();
+                    }
+                    if valid == false {
+                        // Slicing by byte index is safe: this range holds only ASCII chars
+                        Text(&self.src[index..end])
+                    } else {
+                        UnicodeEscape(&self.src[index + 3..end])
+                    }
+                } else {
+                    Text("\\u")
+                }
+            }
             Some(c) if is_escapable(c) => {
                 let index = self.index();
                 self.eat();
@@ -579,6 +618,7 @@ mod tests {
         Plus,
         Hyphen as Min,
         Slash,
+        UnicodeEscape as UE,
         Star,
         Text as T,
     };
@@ -701,14 +741,16 @@ mod tests {
 
     #[test]
     fn tokenize_escaped_symbols() {
-        t!(Body, r"\\"   => T(r"\"));
-        t!(Body, r"\["   => T("["));
-        t!(Body, r"\]"   => T("]"));
-        t!(Body, r"\*"   => T("*"));
-        t!(Body, r"\_"   => T("_"));
-        t!(Body, r"\`"   => T("`"));
-        t!(Body, r"\/"   => T("/"));
-        t!(Body, r#"\""# => T("\""));
+        t!(Body, r"\\"       => T(r"\"));
+        t!(Body, r"\["       => T("["));
+        t!(Body, r"\]"       => T("]"));
+        t!(Body, r"\*"       => T("*"));
+        t!(Body, r"\_"       => T("_"));
+        t!(Body, r"\`"       => T("`"));
+        t!(Body, r"\/"       => T("/"));
+        t!(Body, r"\u{2603}" => UE("2603"));
+        t!(Body, r"\u{26A4"  => UE("26A4"));
+        t!(Body, r#"\""#     => T("\""));
     }
 
     #[test]
@@ -716,6 +758,9 @@ mod tests {
         t!(Body, r"\a"     => T("\\"), T("a"));
         t!(Body, r"\:"     => T(r"\"), T(":"));
         t!(Body, r"\="     => T(r"\"), T("="));
+        t!(Body, r"\u{2GA4"=> T(r"\u{2"), Text("GA4"));
+        t!(Body, r"\u{ "   => T(r"\u{"), Space(0));
+        t!(Body, r"\u"     => T(r"\u"));
         t!(Header, r"\\\\" => Invalid(r"\\\\"));
         t!(Header, r"\a"   => Invalid(r"\a"));
         t!(Header, r"\:"   => Invalid(r"\"), Colon);
author	Martin Haug <mhaug@live.de>	2020-08-30 13:21:07 +0200
committer	Martin Haug <mhaug@live.de>	2020-08-30 13:21:07 +0200
commit	7041e0938dd17e9a1777857459df4e8ad29b6c4a (patch)
tree	431d3c983e70bf0a0f9fbb36796a7197f8e64861 /src/syntax/tokens.rs
parent	2a6cde72726c057e2166fb4277b8fe53c398b3f9 (diff)