diff options
| author | Laurenz <laurmaedje@gmail.com> | 2020-10-01 01:38:18 +0200 |
|---|---|---|
| committer | Laurenz <laurmaedje@gmail.com> | 2020-10-01 01:38:18 +0200 |
| commit | 4b9bc660281b2740c310bd9439493064017c9814 (patch) | |
| tree | 609a44b34871c8582dffaf27cbb6636f1a869313 /src/parse/resolve.rs | |
| parent | 38607b8bea1ede7a124c8fe384d7efca76f9f011 (diff) | |
Implement low-level char parser 🥜
Diffstat (limited to 'src/parse/resolve.rs')
| -rw-r--r-- | src/parse/resolve.rs | 191 |
1 files changed, 191 insertions, 0 deletions
diff --git a/src/parse/resolve.rs b/src/parse/resolve.rs new file mode 100644 index 00000000..422f9385 --- /dev/null +++ b/src/parse/resolve.rs @@ -0,0 +1,191 @@ +//! Resolve strings and raw blocks. + +use super::{is_newline_char, CharParser}; +use crate::syntax::{Ident, Raw}; + +/// Resolves all escape sequences in a string. +pub fn resolve_string(string: &str) -> String { + let mut out = String::with_capacity(string.len()); + let mut p = CharParser::new(string); + + while let Some(c) = p.eat() { + if c != '\\' { + out.push(c); + continue; + } + + let start = p.prev_index(); + match p.eat() { + Some('\\') => out.push('\\'), + Some('"') => out.push('"'), + + Some('n') => out.push('\n'), + Some('t') => out.push('\t'), + Some('u') if p.eat_if('{') => { + // TODO: Feedback if closing brace is missing. + let sequence = p.eat_while(|c| c.is_ascii_hexdigit()); + let _terminated = p.eat_if('}'); + + if let Some(c) = resolve_hex(sequence) { + out.push(c); + } else { + // TODO: Feedback that escape sequence is wrong. + out += p.eaten_from(start); + } + } + + // TODO: Feedback about invalid escape sequence. + _ => out += p.eaten_from(start), + } + } + + out +} + +/// Resolve a hexademical escape sequence (only the inner hex letters without +/// braces or `\u`) into a character. +pub fn resolve_hex(sequence: &str) -> Option<char> { + u32::from_str_radix(sequence, 16).ok().and_then(std::char::from_u32) +} + +/// Resolves the language tag and trims the raw text. +pub fn resolve_raw(raw: &str, backticks: usize) -> Raw { + if backticks > 1 { + let (tag, inner) = split_at_lang_tag(raw); + let (lines, had_newline) = trim_and_split_raw(inner); + Raw { + lang: Ident::new(tag), + lines, + inline: !had_newline, + } + } else { + Raw { + lang: None, + lines: split_lines(raw), + inline: true, + } + } +} + +/// Parse the lang tag and return it alongside the remaining inner raw text. +fn split_at_lang_tag(raw: &str) -> (&str, &str) { + let mut p = CharParser::new(raw); + ( + p.eat_until(|c| c == '`' || c.is_whitespace() || is_newline_char(c)), + p.rest(), + ) +} + +/// Trims raw text and splits it into lines. +/// +/// Returns whether at least one newline was contained in `raw`. +fn trim_and_split_raw(raw: &str) -> (Vec<String>, bool) { + // Trims one whitespace at end and start. + let raw = raw.strip_prefix(' ').unwrap_or(raw); + let raw = raw.strip_suffix(' ').unwrap_or(raw); + + let mut lines = split_lines(raw); + let had_newline = lines.len() > 1; + let is_whitespace = |line: &String| line.chars().all(char::is_whitespace); + + // Trims a sequence of whitespace followed by a newline at the start. + if lines.first().map(is_whitespace).unwrap_or(false) { + lines.remove(0); + } + + // Trims a newline followed by a sequence of whitespace at the end. + if lines.last().map(is_whitespace).unwrap_or(false) { + lines.pop(); + } + + (lines, had_newline) +} + +/// Splits a string into a vector of lines (respecting Unicode & Windows line +/// breaks). +pub fn split_lines(text: &str) -> Vec<String> { + let mut p = CharParser::new(text); + let mut line = String::new(); + let mut lines = Vec::new(); + + while let Some(c) = p.eat_merging_crlf() { + if is_newline_char(c) { + lines.push(std::mem::take(&mut line)); + } else { + line.push(c); + } + } + + lines.push(line); + lines +} + +#[cfg(test)] +#[rustfmt::skip] +mod tests { + use super::*; + + #[test] + fn test_unescape_strings() { + fn test(string: &str, expected: &str) { + assert_eq!(resolve_string(string), expected.to_string()); + } + + test(r#"hello world"#, "hello world"); + test(r#"hello\nworld"#, "hello\nworld"); + test(r#"a\"bc"#, "a\"bc"); + test(r#"a\u{2603}bc"#, "a☃bc"); + test(r#"a\u{26c3bg"#, "a𦰻g"); + test(r#"av\u{6797"#, "av林"); + test(r#"a\\"#, "a\\"); + test(r#"a\\\nbc"#, "a\\\nbc"); + test(r#"a\tbc"#, "a\tbc"); + test(r"🌎", "🌎"); + test(r"🌎\", r"🌎\"); + test(r"\🌎", r"\🌎"); + } + + #[test] + fn test_split_at_lang_tag() { + fn test(raw: &str, lang: &str, inner: &str) { + assert_eq!(split_at_lang_tag(raw), (lang, inner)); + } + + test("typst it!", "typst", " it!"); + test("typst\n it!", "typst", "\n it!"); + test("typst\n it!", "typst", "\n it!"); + test("abc`", "abc", "`"); + test(" hi", "", " hi"); + test("`", "", "`"); + } + + #[test] + fn test_trim_raw() { + fn test(raw: &str, expected: Vec<&str>) { + assert_eq!(trim_and_split_raw(raw).0, expected); + } + + test(" hi", vec!["hi"]); + test(" hi", vec![" hi"]); + test("\nhi", vec!["hi"]); + test(" \n hi", vec![" hi"]); + test("hi ", vec!["hi"]); + test("hi ", vec!["hi "]); + test("hi\n", vec!["hi"]); + test("hi \n ", vec!["hi "]); + test(" \n hi \n ", vec![" hi "]); + } + + #[test] + fn test_split_lines() { + fn test(raw: &str, expected: Vec<&str>) { + assert_eq!(split_lines(raw), expected); + } + + test("raw\ntext", vec!["raw", "text"]); + test("a\r\nb", vec!["a", "b"]); + test("a\n\nb", vec!["a", "", "b"]); + test("a\r\x0Bb", vec!["a", "", "b"]); + test("a\r\n\r\nb", vec!["a", "", "b"]); + } +} |
