summaryrefslogtreecommitdiff
path: root/src/syntax/tokens.rs
diff options
context:
space:
mode:
authorMartin Haug <mhaug@live.de>2020-08-30 13:21:07 +0200
committerMartin Haug <mhaug@live.de>2020-08-30 13:21:07 +0200
commit7041e0938dd17e9a1777857459df4e8ad29b6c4a (patch)
tree431d3c983e70bf0a0f9fbb36796a7197f8e64861 /src/syntax/tokens.rs
parent2a6cde72726c057e2166fb4277b8fe53c398b3f9 (diff)
Added Unicode Escaping for body text 👙
Diffstat (limited to 'src/syntax/tokens.rs')
-rw-r--r--src/syntax/tokens.rs61
1 files changed, 53 insertions, 8 deletions
diff --git a/src/syntax/tokens.rs b/src/syntax/tokens.rs
index 7ecb05fe..f00c1b66 100644
--- a/src/syntax/tokens.rs
+++ b/src/syntax/tokens.rs
@@ -82,6 +82,9 @@ pub enum Token<'s> {
/// A backslash followed by whitespace in text.
Backslash,
+ /// A unicode escape sequence.
+ UnicodeEscape(&'s str),
+
/// Raw text.
Raw {
/// The raw text (not yet unescaped as for strings).
@@ -136,6 +139,7 @@ impl<'s> Token<'s> {
Star => "star",
Underscore => "underscore",
Backslash => "backslash",
+ UnicodeEscape(_) => "unicode escape sequence",
Raw { .. } => "raw text",
Code { .. } => "code block",
Text(_) => "text",
@@ -426,6 +430,41 @@ impl<'s> Tokens<'s> {
}
match self.peek() {
+ Some(c) if c == 'u' => {
+ // Index of the backslash that starts the escape sequence
+ let index = self.index() - 1;
+ self.eat();
+
+ if self.peek() == Some('{') {
+ self.eat();
+ // This loop will eat all hexadecimal chars and an
+ // optional closing brace (brace not in end index range).
+ let mut end = self.index();
+ let mut valid = true;
+ while let Some(c) = self.peek() {
+ if c == '}' {
+ self.eat();
+ break;
+ }
+
+ if !c.is_ascii_hexdigit() {
+ valid = false;
+ break;
+ }
+
+ self.eat();
+ end = self.index();
+ }
+ if valid == false {
+ // Only single-byte ASCII chars are in that range, so byte slicing is safe
+ Text(&self.src[index..end])
+ } else {
+ UnicodeEscape(&self.src[index + 3..end])
+ }
+ } else {
+ Text("\\u")
+ }
+ }
Some(c) if is_escapable(c) => {
let index = self.index();
self.eat();
@@ -579,6 +618,7 @@ mod tests {
Plus,
Hyphen as Min,
Slash,
+ UnicodeEscape as UE,
Star,
Text as T,
};
@@ -701,14 +741,16 @@ mod tests {
#[test]
fn tokenize_escaped_symbols() {
- t!(Body, r"\\" => T(r"\"));
- t!(Body, r"\[" => T("["));
- t!(Body, r"\]" => T("]"));
- t!(Body, r"\*" => T("*"));
- t!(Body, r"\_" => T("_"));
- t!(Body, r"\`" => T("`"));
- t!(Body, r"\/" => T("/"));
- t!(Body, r#"\""# => T("\""));
+ t!(Body, r"\\" => T(r"\"));
+ t!(Body, r"\[" => T("["));
+ t!(Body, r"\]" => T("]"));
+ t!(Body, r"\*" => T("*"));
+ t!(Body, r"\_" => T("_"));
+ t!(Body, r"\`" => T("`"));
+ t!(Body, r"\/" => T("/"));
+ t!(Body, r"\u{2603}" => UE("2603"));
+ t!(Body, r"\u{26A4" => UE("26A4"));
+ t!(Body, r#"\""# => T("\""));
}
#[test]
@@ -716,6 +758,9 @@ mod tests {
t!(Body, r"\a" => T("\\"), T("a"));
t!(Body, r"\:" => T(r"\"), T(":"));
t!(Body, r"\=" => T(r"\"), T("="));
+ t!(Body, r"\u{2GA4"=> T(r"\u{2"), Text("GA4"));
+ t!(Body, r"\u{ " => T(r"\u{"), Space(0));
+ t!(Body, r"\u" => T(r"\u"));
t!(Header, r"\\\\" => Invalid(r"\\\\"));
t!(Header, r"\a" => Invalid(r"\a"));
t!(Header, r"\:" => Invalid(r"\"), Colon);