summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorLaurenz <laurmaedje@gmail.com>2020-08-30 15:22:56 +0200
committerGitHub <noreply@github.com>2020-08-30 15:22:56 +0200
commit07f387d08825cd3de4ea0965b2564ce8f0c4a166 (patch)
tree3a5f06e3bd713ce2a82a7a166b6256fac103c33e /src
parentf7f255d5ea4262e545fd33dbb910d683e5d738ff (diff)
parent9861a9583eb52f5d66a7a350e5c0bd23985b0414 (diff)
Merge pull request #12 from typst/unicode-escape
Unicode escape sequences in strings and body text
Diffstat (limited to 'src')
-rw-r--r--src/syntax/parsing.rs78
-rw-r--r--src/syntax/tokens.rs50
2 files changed, 110 insertions, 18 deletions
diff --git a/src/syntax/parsing.rs b/src/syntax/parsing.rs
index 0d12f6e1..e35835c8 100644
--- a/src/syntax/parsing.rs
+++ b/src/syntax/parsing.rs
@@ -110,6 +110,26 @@ impl Parser<'_> {
self.with_span(SyntaxNode::Text(text.to_string()))
}
+ Token::UnicodeEscape { sequence, terminated } => {
+ if !terminated {
+ error!(
+ @self.feedback, Span::at(token.span.end),
+ "expected closing brace",
+ );
+ }
+
+ if let Some(c) = unescape_char(sequence) {
+ self.with_span(SyntaxNode::Text(c.to_string()))
+ } else {
+ self.eat();
+ error!(
+ @self.feedback, token.span,
+ "invalid unicode escape sequence",
+ );
+ continue;
+ }
+ }
+
unexpected => {
self.eat();
error!(
@@ -594,7 +614,7 @@ impl Group {
}
fn unescape_string(string: &str) -> String {
- let mut iter = string.chars();
+ let mut iter = string.chars().peekable();
let mut out = String::with_capacity(string.len());
while let Some(c) = iter.next() {
@@ -602,6 +622,36 @@ fn unescape_string(string: &str) -> String {
match iter.next() {
Some('\\') => out.push('\\'),
Some('"') => out.push('"'),
+ Some('u') if iter.peek() == Some(&'{') => {
+ iter.next();
+
+ let mut sequence = String::new();
+ let terminated = loop {
+ match iter.peek() {
+ // TODO: Feedback that closing brace is missing.
+ Some('}') => {
+ iter.next();
+ break true;
+ }
+ Some(&c) if c.is_ascii_hexdigit() => {
+ iter.next();
+ sequence.push(c);
+ }
+ _ => break false,
+ }
+ };
+
+ // TODO: Feedback that escape sequence is wrong.
+ if let Some(c) = unescape_char(&sequence) {
+ out.push(c);
+ } else {
+ out.push_str("\\u{");
+ out.push_str(&sequence);
+ if terminated {
+ out.push('}');
+ }
+ }
+ }
Some('n') => out.push('\n'),
Some('t') => out.push('\t'),
Some(c) => { out.push('\\'); out.push(c); }
@@ -617,7 +667,7 @@ fn unescape_string(string: &str) -> String {
/// Unescape raw markup and split it into into lines.
fn unescape_raw(raw: &str) -> Vec<String> {
- let mut iter = raw.chars().peekable();
+ let mut iter = raw.chars();
let mut text = String::new();
while let Some(c) = iter.next() {
@@ -705,6 +755,11 @@ fn unescape_code(raw: &str) -> Vec<String> {
split_lines(&text)
}
+/// Converts a hexadecimal sequence (without braces or "\u") into a character.
+fn unescape_char(sequence: &str) -> Option<char> {
+ u32::from_str_radix(sequence, 16).ok().and_then(std::char::from_u32)
+}
+
fn split_lines(text: &str) -> Vec<String> {
let mut iter = text.chars().peekable();
let mut line = String::new();
@@ -890,6 +945,9 @@ mod tests {
test(r#"hello world"#, "hello world");
test(r#"hello\nworld"#, "hello\nworld");
test(r#"a\"bc"#, "a\"bc");
+ test(r#"a\u{2603}bc"#, "a☃bc");
+ test(r#"a\u{26c3bg"#, "a𦰻g");
+ test(r#"av\u{6797"#, "av林");
test(r#"a\\"#, "a\\");
test(r#"a\\\nbc"#, "a\\\nbc");
test(r#"a\tbc"#, "a\tbc");
@@ -944,6 +1002,7 @@ mod tests {
t!("*hi" => B, T("hi"));
t!("hi_" => T("hi"), I);
t!("hi you" => T("hi"), S, T("you"));
+ t!("\\u{1f303}" => T("🌃"));
t!("\n\n\nhello" => P, T("hello"));
t!(r"a\ b" => T("a"), L, S, T("b"));
t!("`py`" => R!["py"]);
@@ -951,17 +1010,16 @@ mod tests {
e!("`hi\nyou" => s(1,3, 1,3, "expected backtick"));
t!("`hi\\`du`" => R!["hi`du"]);
- t!("```java System.out.print```" => C![
- Some("java"), "System.out.print"
- ]);
- t!("``` console.log(\n\"alert\"\n)" => C![
- None, "console.log(", "\"alert\"", ")"
- ]);
+ t!("```java System.out.print```" => C![Some("java"), "System.out.print"]);
+ t!("``` console.log(\n\"alert\"\n)" => C![None, "console.log(", "\"alert\"", ")"]);
t!("```typst \r\n Typst uses `\\`` to indicate code blocks" => C![
Some("typst"), " Typst uses ``` to indicate code blocks"
]);
- e!("``` hi\nyou" => s(1,3, 1,3, "expected backticks"));
- e!("```🌍 hi\nyou```" => s(0,3, 0,4, "invalid identifier"));
+
+ e!("``` hi\nyou" => s(1,3, 1,3, "expected backticks"));
+ e!("```🌍 hi\nyou```" => s(0,3, 0,4, "invalid identifier"));
+ e!("\\u{d421c809}" => s(0,0, 0,12, "invalid unicode escape sequence"));
+ e!("\\u{abc" => s(0,6, 0,6, "expected closing brace"));
t!("💜\n\n 🌍" => T("💜"), P, T("🌍"));
ts!("hi" => s(0,0, 0,2, T("hi")));
diff --git a/src/syntax/tokens.rs b/src/syntax/tokens.rs
index 7ecb05fe..fe20d11a 100644
--- a/src/syntax/tokens.rs
+++ b/src/syntax/tokens.rs
@@ -82,6 +82,14 @@ pub enum Token<'s> {
/// A backslash followed by whitespace in text.
Backslash,
+ /// A unicode escape sequence.
+ UnicodeEscape {
+ /// The escape sequence between two braces.
+ sequence: &'s str,
+ /// Whether the closing brace was present.
+ terminated: bool,
+ },
+
/// Raw text.
Raw {
/// The raw text (not yet unescaped as for strings).
@@ -136,6 +144,7 @@ impl<'s> Token<'s> {
Star => "star",
Underscore => "underscore",
Backslash => "backslash",
+ UnicodeEscape { .. } => "unicode escape sequence",
Raw { .. } => "raw text",
Code { .. } => "code block",
Text(_) => "text",
@@ -426,6 +435,25 @@ impl<'s> Tokens<'s> {
}
match self.peek() {
+ Some('u') => {
+ self.eat();
+ if self.peek() == Some('{') {
+ self.eat();
+ let sequence = self.read_string_until(
+ |c| !c.is_ascii_hexdigit(),
+ false, 0, 0,
+ ).0;
+
+ let terminated = self.peek() == Some('}');
+ if terminated {
+ self.eat();
+ }
+
+ UnicodeEscape { sequence, terminated }
+ } else {
+ Text("\\u")
+ }
+ }
Some(c) if is_escapable(c) => {
let index = self.index();
self.eat();
@@ -588,6 +616,7 @@ mod tests {
fn Code<'a>(lang: Option<&'a str>, raw: &'a str, terminated: bool) -> Token<'a> {
Token::Code { lang: lang.map(Spanned::zero), raw, terminated }
}
+ fn UE(sequence: &str, terminated: bool) -> Token { Token::UnicodeEscape { sequence, terminated } }
macro_rules! t { ($($tts:tt)*) => {test!(@spans=false, $($tts)*)} }
macro_rules! ts { ($($tts:tt)*) => {test!(@spans=true, $($tts)*)} }
@@ -701,14 +730,16 @@ mod tests {
#[test]
fn tokenize_escaped_symbols() {
- t!(Body, r"\\" => T(r"\"));
- t!(Body, r"\[" => T("["));
- t!(Body, r"\]" => T("]"));
- t!(Body, r"\*" => T("*"));
- t!(Body, r"\_" => T("_"));
- t!(Body, r"\`" => T("`"));
- t!(Body, r"\/" => T("/"));
- t!(Body, r#"\""# => T("\""));
+ t!(Body, r"\\" => T(r"\"));
+ t!(Body, r"\[" => T("["));
+ t!(Body, r"\]" => T("]"));
+ t!(Body, r"\*" => T("*"));
+ t!(Body, r"\_" => T("_"));
+ t!(Body, r"\`" => T("`"));
+ t!(Body, r"\/" => T("/"));
+ t!(Body, r"\u{2603}" => UE("2603", true));
+ t!(Body, r"\u{26A4" => UE("26A4", false));
+ t!(Body, r#"\""# => T("\""));
}
#[test]
@@ -716,6 +747,9 @@ mod tests {
t!(Body, r"\a" => T("\\"), T("a"));
t!(Body, r"\:" => T(r"\"), T(":"));
t!(Body, r"\=" => T(r"\"), T("="));
+ t!(Body, r"\u{2GA4"=> UE("2", false), T("GA4"));
+ t!(Body, r"\u{ " => UE("", false), Space(0));
+ t!(Body, r"\u" => T(r"\u"));
t!(Header, r"\\\\" => Invalid(r"\\\\"));
t!(Header, r"\a" => Invalid(r"\a"));
t!(Header, r"\:" => Invalid(r"\"), Colon);