From bc1b4216a802d09e8d00dd277a0e204d49bcaa7f Mon Sep 17 00:00:00 2001 From: Laurenz Date: Wed, 30 Sep 2020 12:38:02 +0200 Subject: =?UTF-8?q?Reorganize=20syntax=20types=20into=20two=20modules=20?= =?UTF-8?q?=F0=9F=93=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/parse/escaping.rs | 243 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 243 insertions(+) create mode 100644 src/parse/escaping.rs (limited to 'src/parse/escaping.rs') diff --git a/src/parse/escaping.rs b/src/parse/escaping.rs new file mode 100644 index 00000000..55b1fe67 --- /dev/null +++ b/src/parse/escaping.rs @@ -0,0 +1,243 @@ +use super::is_newline_char; + +/// Resolves all escape sequences in a string. +pub fn unescape_string(string: &str) -> String { + let mut iter = string.chars().peekable(); + let mut out = String::with_capacity(string.len()); + + while let Some(c) = iter.next() { + if c == '\\' { + match iter.next() { + Some('\\') => out.push('\\'), + Some('"') => out.push('"'), + Some('u') if iter.peek() == Some(&'{') => { + iter.next(); + + let mut sequence = String::new(); + let terminated = loop { + match iter.peek() { + // TODO: Feedback that closing brace is missing. + Some('}') => { + iter.next(); + break true; + } + Some(&c) if c.is_ascii_hexdigit() => { + iter.next(); + sequence.push(c); + } + _ => break false, + } + }; + + // TODO: Feedback that escape sequence is wrong. + if let Some(c) = hex_to_char(&sequence) { + out.push(c); + } else { + out.push_str("\\u{"); + out.push_str(&sequence); + if terminated { + out.push('}'); + } + } + } + Some('n') => out.push('\n'), + Some('t') => out.push('\t'), + Some(c) => { + out.push('\\'); + out.push(c); + } + None => out.push('\\'), + } + } else { + out.push(c); + } + } + + out +} + +/// Resolves all escape sequences in raw markup (between backticks) and splits it into +/// into lines. +pub fn unescape_raw(raw: &str) -> Vec { + let mut iter = raw.chars(); + let mut text = String::new(); + + while let Some(c) = iter.next() { + if c == '\\' { + if let Some(c) = iter.next() { + if c != '\\' && c != '`' { + text.push('\\'); + } + + text.push(c); + } else { + text.push('\\'); + } + } else { + text.push(c); + } + } + + split_lines(&text) +} + +/// Resolves all escape sequences in code markup (between triple backticks) and splits it +/// into into lines. +pub fn unescape_code(raw: &str) -> Vec { + let mut iter = raw.chars().peekable(); + let mut text = String::new(); + let mut backticks = 0u32; + let mut update_backtick_count; + + while let Some(c) = iter.next() { + update_backtick_count = true; + + if c == '\\' && backticks > 0 { + let mut tail = String::new(); + let mut escape_success = false; + let mut backticks_after_slash = 0u32; + + while let Some(&s) = iter.peek() { + match s { + '\\' => { + if backticks_after_slash == 0 { + tail.push('\\'); + } else { + // Pattern like `\`\` should fail + // escape and just be printed verbantim. + break; + } + } + '`' => { + tail.push(s); + backticks_after_slash += 1; + if backticks_after_slash == 2 { + escape_success = true; + iter.next(); + break; + } + } + _ => break, + } + + iter.next(); + } + + if !escape_success { + text.push(c); + backticks = backticks_after_slash; + update_backtick_count = false; + } else { + backticks = 0; + } + + text.push_str(&tail); + } else { + text.push(c); + } + + if update_backtick_count { + if c == '`' { + backticks += 1; + } else { + backticks = 0; + } + } + } + + split_lines(&text) +} + +/// Converts a hexademical sequence (without braces or "\u") into a character. +pub fn hex_to_char(sequence: &str) -> Option { + u32::from_str_radix(sequence, 16).ok().and_then(std::char::from_u32) +} + +/// Splits a string into a vector of lines (respecting Unicode & Windows line breaks). +pub fn split_lines(text: &str) -> Vec { + let mut iter = text.chars().peekable(); + let mut line = String::new(); + let mut lines = Vec::new(); + + while let Some(c) = iter.next() { + if is_newline_char(c) { + if c == '\r' && iter.peek() == Some(&'\n') { + iter.next(); + } + + lines.push(std::mem::take(&mut line)); + } else { + line.push(c); + } + } + + lines.push(line); + lines +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + #[rustfmt::skip] + fn test_unescape_strings() { + fn test(string: &str, expected: &str) { + assert_eq!(unescape_string(string), expected.to_string()); + } + + test(r#"hello world"#, "hello world"); + test(r#"hello\nworld"#, "hello\nworld"); + test(r#"a\"bc"#, "a\"bc"); + test(r#"a\u{2603}bc"#, "a☃bc"); + test(r#"a\u{26c3bg"#, "a𦰻g"); + test(r#"av\u{6797"#, "av林"); + test(r#"a\\"#, "a\\"); + test(r#"a\\\nbc"#, "a\\\nbc"); + test(r#"a\tbc"#, "a\tbc"); + test(r"🌎", "🌎"); + test(r"🌎\", r"🌎\"); + test(r"\🌎", r"\🌎"); + } + + #[test] + #[rustfmt::skip] + fn test_unescape_raws() { + fn test(raw: &str, expected: Vec<&str>) { + assert_eq!(unescape_raw(raw), expected); + } + + test("raw\\`", vec!["raw`"]); + test("raw\\\\`", vec!["raw\\`"]); + test("raw\ntext", vec!["raw", "text"]); + test("a\r\nb", vec!["a", "b"]); + test("a\n\nb", vec!["a", "", "b"]); + test("a\r\x0Bb", vec!["a", "", "b"]); + test("a\r\n\r\nb", vec!["a", "", "b"]); + test("raw\\a", vec!["raw\\a"]); + test("raw\\", vec!["raw\\"]); + } + + #[test] + #[rustfmt::skip] + fn test_unescape_code() { + fn test(raw: &str, expected: Vec<&str>) { + assert_eq!(unescape_code(raw), expected); + } + + test("code\\`", vec!["code\\`"]); + test("code`\\``", vec!["code```"]); + test("code`\\`a", vec!["code`\\`a"]); + test("code``hi`\\``", vec!["code``hi```"]); + test("code`\\\\``", vec!["code`\\``"]); + test("code`\\`\\`go", vec!["code`\\`\\`go"]); + test("code`\\`\\``", vec!["code`\\```"]); + test("code\ntext", vec!["code", "text"]); + test("a\r\nb", vec!["a", "b"]); + test("a\n\nb", vec!["a", "", "b"]); + test("a\r\x0Bb", vec!["a", "", "b"]); + test("a\r\n\r\nb", vec!["a", "", "b"]); + test("code\\a", vec!["code\\a"]); + test("code\\", vec!["code\\"]); + } +} -- cgit v1.2.3