diff options
| author | Laurenz <laurmaedje@gmail.com> | 2020-09-30 18:59:33 +0200 |
|---|---|---|
| committer | Laurenz <laurmaedje@gmail.com> | 2020-09-30 18:59:33 +0200 |
| commit | 4077a7c11ea19b1b6b6b6fe3014b9018846cf21b (patch) | |
| tree | 70e4c891c2c660b4136890cebbae7c375fe36c05 /src/parse | |
| parent | 7cc279f7ae122f4c40592004dde89792c636b3c8 (diff) | |
Refactor raw blocks 💱
Diffstat (limited to 'src/parse')
| -rw-r--r-- | src/parse/escaping.rs | 198 | ||||
| -rw-r--r-- | src/parse/parser.rs | 35 | ||||
| -rw-r--r-- | src/parse/tests.rs | 60 | ||||
| -rw-r--r-- | src/parse/tokenizer.rs | 121 |
4 files changed, 173 insertions, 241 deletions
diff --git a/src/parse/escaping.rs b/src/parse/escaping.rs index 55b1fe67..a2ff963b 100644 --- a/src/parse/escaping.rs +++ b/src/parse/escaping.rs @@ -1,4 +1,5 @@ use super::is_newline_char; +use crate::syntax::{Ident, Raw}; /// Resolves all escape sequences in a string. pub fn unescape_string(string: &str) -> String { @@ -56,101 +57,60 @@ pub fn unescape_string(string: &str) -> String { out } -/// Resolves all escape sequences in raw markup (between backticks) and splits it into -/// into lines. -pub fn unescape_raw(raw: &str) -> Vec<String> { +/// Resolves the language tag and trims the raw text. +/// +/// Returns: +/// - The language tag +/// - The raw lines +/// - Whether at least one newline was present in the untrimmed text. +pub fn process_raw(raw: &str) -> Raw { + let (lang, inner) = split_after_lang_tag(raw); + let (lines, had_newline) = trim_and_split_raw(inner); + Raw { lang, lines, inline: !had_newline } +} + +/// Parse the lang tag and return it alongside the remaining inner raw text. +fn split_after_lang_tag(raw: &str) -> (Option<Ident>, &str) { + let mut lang = String::new(); + + let mut inner = raw; let mut iter = raw.chars(); - let mut text = String::new(); while let Some(c) = iter.next() { - if c == '\\' { - if let Some(c) = iter.next() { - if c != '\\' && c != '`' { - text.push('\\'); - } - - text.push(c); - } else { - text.push('\\'); - } - } else { - text.push(c); + if c == '`' || c.is_whitespace() || is_newline_char(c) { + break; } + + inner = iter.as_str(); + lang.push(c); } - split_lines(&text) + (Ident::new(lang), inner) } -/// Resolves all escape sequences in code markup (between triple backticks) and splits it -/// into into lines. -pub fn unescape_code(raw: &str) -> Vec<String> { - let mut iter = raw.chars().peekable(); - let mut text = String::new(); - let mut backticks = 0u32; - let mut update_backtick_count; - - while let Some(c) = iter.next() { - update_backtick_count = true; - - if c == '\\' && backticks > 0 { - let mut tail = String::new(); - let mut escape_success = false; - let mut backticks_after_slash = 0u32; - - while let Some(&s) = iter.peek() { - match s { - '\\' => { - if backticks_after_slash == 0 { - tail.push('\\'); - } else { - // Pattern like `\`\` should fail - // escape and just be printed verbantim. - break; - } - } - '`' => { - tail.push(s); - backticks_after_slash += 1; - if backticks_after_slash == 2 { - escape_success = true; - iter.next(); - break; - } - } - _ => break, - } - - iter.next(); - } - - if !escape_success { - text.push(c); - backticks = backticks_after_slash; - update_backtick_count = false; - } else { - backticks = 0; - } - - text.push_str(&tail); - } else { - text.push(c); - } - - if update_backtick_count { - if c == '`' { - backticks += 1; - } else { - backticks = 0; - } - } +/// Trims raw text and splits it into lines. +/// +/// Returns whether at least one newline was contained in `raw`. +fn trim_and_split_raw(raw: &str) -> (Vec<String>, bool) { + // Trims one whitespace at end and start. + let raw = raw.strip_prefix(' ').unwrap_or(raw); + let raw = raw.strip_suffix(' ').unwrap_or(raw); + + let mut lines = split_lines(raw); + let had_newline = lines.len() > 1; + let is_whitespace = |line: &String| line.chars().all(char::is_whitespace); + + // Trims a sequence of whitespace followed by a newline at the start. + if lines.first().map(is_whitespace).unwrap_or(false) { + lines.remove(0); } - split_lines(&text) -} + // Trims a newline followed by a sequence of whitespace at the end. + if lines.last().map(is_whitespace).unwrap_or(false) { + lines.pop(); + } -/// Converts a hexademical sequence (without braces or "\u") into a character. -pub fn hex_to_char(sequence: &str) -> Option<char> { - u32::from_str_radix(sequence, 16).ok().and_then(std::char::from_u32) + (lines, had_newline) } /// Splits a string into a vector of lines (respecting Unicode & Windows line breaks). @@ -175,12 +135,17 @@ pub fn split_lines(text: &str) -> Vec<String> { lines } +/// Converts a hexademical sequence (without braces or "\u") into a character. +pub fn hex_to_char(sequence: &str) -> Option<char> { + u32::from_str_radix(sequence, 16).ok().and_then(std::char::from_u32) +} + #[cfg(test)] +#[rustfmt::skip] mod tests { use super::*; #[test] - #[rustfmt::skip] fn test_unescape_strings() { fn test(string: &str, expected: &str) { assert_eq!(unescape_string(string), expected.to_string()); @@ -201,43 +166,48 @@ mod tests { } #[test] - #[rustfmt::skip] - fn test_unescape_raws() { + fn test_split_after_lang_tag() { + fn test(raw: &str, lang: Option<&str>, inner: &str) { + let (found_lang, found_inner) = split_after_lang_tag(raw); + assert_eq!(found_lang.as_ref().map(|id| id.as_str()), lang); + assert_eq!(found_inner, inner); + } + + test("typst it!", Some("typst"), " it!"); + test("typst\n it!", Some("typst"), "\n it!"); + test("typst\n it!", Some("typst"), "\n it!"); + test("abc`", Some("abc"), "`"); + test(" hi", None, " hi"); + test("`", None, "`"); + } + + #[test] + fn test_trim_raw() { fn test(raw: &str, expected: Vec<&str>) { - assert_eq!(unescape_raw(raw), expected); + assert_eq!(trim_and_split_raw(raw).0, expected); } - test("raw\\`", vec!["raw`"]); - test("raw\\\\`", vec!["raw\\`"]); - test("raw\ntext", vec!["raw", "text"]); - test("a\r\nb", vec!["a", "b"]); - test("a\n\nb", vec!["a", "", "b"]); - test("a\r\x0Bb", vec!["a", "", "b"]); - test("a\r\n\r\nb", vec!["a", "", "b"]); - test("raw\\a", vec!["raw\\a"]); - test("raw\\", vec!["raw\\"]); + test(" hi", vec!["hi"]); + test(" hi", vec![" hi"]); + test("\nhi", vec!["hi"]); + test(" \n hi", vec![" hi"]); + test("hi ", vec!["hi"]); + test("hi ", vec!["hi "]); + test("hi\n", vec!["hi"]); + test("hi \n ", vec!["hi "]); + test(" \n hi \n ", vec![" hi "]); } #[test] - #[rustfmt::skip] - fn test_unescape_code() { + fn test_split_lines() { fn test(raw: &str, expected: Vec<&str>) { - assert_eq!(unescape_code(raw), expected); + assert_eq!(split_lines(raw), expected); } - test("code\\`", vec!["code\\`"]); - test("code`\\``", vec!["code```"]); - test("code`\\`a", vec!["code`\\`a"]); - test("code``hi`\\``", vec!["code``hi```"]); - test("code`\\\\``", vec!["code`\\``"]); - test("code`\\`\\`go", vec!["code`\\`\\`go"]); - test("code`\\`\\``", vec!["code`\\```"]); - test("code\ntext", vec!["code", "text"]); - test("a\r\nb", vec!["a", "b"]); - test("a\n\nb", vec!["a", "", "b"]); - test("a\r\x0Bb", vec!["a", "", "b"]); - test("a\r\n\r\nb", vec!["a", "", "b"]); - test("code\\a", vec!["code\\a"]); - test("code\\", vec!["code\\"]); + test("raw\ntext", vec!["raw", "text"]); + test("a\r\nb", vec!["a", "b"]); + test("a\n\nb", vec!["a", "", "b"]); + test("a\r\x0Bb", vec!["a", "", "b"]); + test("a\r\n\r\nb", vec!["a", "", "b"]); } } diff --git a/src/parse/parser.rs b/src/parse/parser.rs index bbd7ee1d..3446af83 100644 --- a/src/parse/parser.rs +++ b/src/parse/parser.rs @@ -99,35 +99,22 @@ impl Parser<'_> { self.parse_heading().map(SyntaxNode::Heading) } - Token::Raw { raw, terminated } => { + Token::Raw { raw, backticks, terminated } => { if !terminated { - error!(@self.feedback, end, "expected backtick"); + error!(@self.feedback, end, "expected backtick(s)"); } - self.with_span(SyntaxNode::Raw(unescape_raw(raw))) - } - Token::Code { lang, raw, terminated } => { - if !terminated { - error!(@self.feedback, end, "expected backticks"); - } - - let lang = lang.and_then(|lang| { - if let Some(ident) = Ident::new(lang.v) { - Some(ident.span_with(lang.span)) - } else { - error!(@self.feedback, lang.span, "invalid identifier"); - None + let raw = if backticks > 1 { + process_raw(raw) + } else { + Raw { + lang: None, + lines: split_lines(raw), + inline: true, } - }); - - let mut lines = unescape_code(raw); - let block = lines.len() > 1; - - if lines.last().map(|s| s.is_empty()).unwrap_or(false) { - lines.pop(); - } + }; - self.with_span(SyntaxNode::Code(Code { lang, lines, block })) + self.with_span(SyntaxNode::Raw(raw)) } Token::Text(text) => self.with_span(SyntaxNode::Text(text.to_string())), diff --git a/src/parse/tests.rs b/src/parse/tests.rs index d663aa2a..8ddf013d 100644 --- a/src/parse/tests.rs +++ b/src/parse/tests.rs @@ -29,24 +29,17 @@ macro_rules! H { } macro_rules! R { - ($($line:expr),* $(,)?) => { - SyntaxNode::Raw(vec![$($line.to_string()),*]) - }; -} - -macro_rules! C { - ($lang:expr, $($line:expr),* $(,)?) => {{ - let lines = vec![$($line.to_string()) ,*]; - SyntaxNode::Code(Code { + ($lang:expr, $inline:expr, $($line:expr),* $(,)?) => {{ + SyntaxNode::Raw(Raw { lang: $lang, - block: lines.len() > 1, - lines, + lines: vec![$($line.to_string()) ,*], + inline: $inline, }) }}; } -fn Lang<'a, T: Into<Spanned<&'a str>>>(lang: T) -> Option<Spanned<Ident>> { - Some(Into::<Spanned<&str>>::into(lang).map(|s| Ident(s.to_string()))) +fn Lang(lang: &str) -> Option<Ident> { + Some(Ident(lang.to_string())) } macro_rules! F { @@ -220,19 +213,7 @@ fn test_parse_simple_nodes() { t!("\\u{1f303}" => T("🌃")); t!("\n\n\nhello" => P, T("hello")); t!(r"a\ b" => T("a"), L, S, T("b")); - t!("`py`" => R!["py"]); - t!("`hi\nyou" => R!["hi", "you"]); - e!("`hi\nyou" => s(7, 7, "expected backtick")); - t!("`hi\\`du`" => R!["hi`du"]); - - ts!("```java out```" => s(0, 14, C![Lang(s(3, 7, "java")), "out"])); - t!("``` console.log(\n\"alert\"\n)" => C![None, "console.log(", "\"alert\"", ")"]); - t!("```typst \r\n Typst uses `\\`` to indicate code blocks" => C![ - Lang("typst"), " Typst uses ``` to indicate code blocks" - ]); - e!("``` hi\nyou" => s(10, 10, "expected backticks")); - e!("```🌍 hi\nyou```" => s(3, 7, "invalid identifier")); e!("\\u{d421c809}" => s(0, 12, "invalid unicode escape sequence")); e!("\\u{abc" => s(6, 6, "expected closing brace")); t!("💜\n\n 🌍" => T("💜"), P, T("🌍")); @@ -243,6 +224,33 @@ fn test_parse_simple_nodes() { } #[test] +fn test_parse_raw() { + t!("`py`" => R![None, true, "py"]); + t!("`hi\nyou" => R![None, true, "hi", "you"]); + t!(r"`` hi\`du``" => R![None, true, r"hi\`du"]); + + // More than one backtick with optional language tag. + t!("``` console.log(\n\"alert\"\n)" => R![None, false, "console.log(", "\"alert\"", ")"]); + t!("````typst \r\n Typst uses ``` to indicate code blocks````!" + => R![Lang("typst"), false, " Typst uses ``` to indicate code blocks"], T("!")); + + // Trimming of whitespace. + t!("`` a ``" => R![None, true, "a"]); + t!("`` a ``" => R![None, true, "a "]); + t!("`` ` ``" => R![None, true, "`"]); + t!("``` ` ```" => R![None, true, " ` "]); + t!("``` ` \n ```" => R![None, false, " ` "]); + + // Errors. + e!("`hi\nyou" => s(7, 7, "expected backtick(s)")); + e!("``` hi\nyou" => s(10, 10, "expected backtick(s)")); + + // TODO: Bring back when spans/errors are in place. + // ts!("``java out``" => s(0, 12, R![Lang(s(2, 6, "java")), true, "out"])); + // e!("```🌍 hi\nyou```" => s(3, 7, "invalid identifier")); +} + +#[test] fn test_parse_comments() { // In body. t!("hi// you\nw" => T("hi"), S, T("w")); @@ -348,7 +356,7 @@ fn test_parse_function_bodies() { e!(" [val][ */]" => s(8, 10, "unexpected end of block comment")); // Raw in body. - t!("[val][`Hi]`" => F!("val"; Tree![R!["Hi]"]])); + t!("[val][`Hi]`" => F!("val"; Tree![R![None, true, "Hi]"]])); e!("[val][`Hi]`" => s(11, 11, "expected closing bracket")); // Crazy. diff --git a/src/parse/tokenizer.rs b/src/parse/tokenizer.rs index 92d15edc..720bec43 100644 --- a/src/parse/tokenizer.rs +++ b/src/parse/tokenizer.rs @@ -56,7 +56,7 @@ impl<'s> Tokens<'s> { /// The position in the string at which the last token ends and next token /// will start. pub fn pos(&self) -> Pos { - Pos(self.index as u32) + self.index.into() } } @@ -111,7 +111,7 @@ impl<'s> Iterator for Tokens<'s> { // Style toggles. '_' if self.mode == Body => Underscore, - '`' if self.mode == Body => self.read_raw_or_code(), + '`' if self.mode == Body => self.read_raw(), // Sections. '#' if self.mode == Body => Hashtag, @@ -230,66 +230,31 @@ impl<'s> Tokens<'s> { Str { string, terminated } } - fn read_raw_or_code(&mut self) -> Token<'s> { - let (raw, terminated) = self.read_until_unescaped('`'); - if raw.is_empty() && terminated && self.peek() == Some('`') { - // Third tick found; this is a code block. + fn read_raw(&mut self) -> Token<'s> { + let mut backticks = 1; + while self.peek() == Some('`') { self.eat(); + backticks += 1; + } - // Reads the lang tag (until newline or whitespace). - let start = self.pos(); - let (lang, _) = self.read_string_until(false, 0, 0, |c| { - c == '`' || c.is_whitespace() || is_newline_char(c) - }); - let end = self.pos(); - - let lang = if !lang.is_empty() { - Some(lang.span_with(Span::new(start, end))) - } else { - None - }; - - // Skip to start of raw contents. - while let Some(c) = self.peek() { - if is_newline_char(c) { - self.eat(); - if c == '\r' && self.peek() == Some('\n') { - self.eat(); - } - - break; - } else if c.is_whitespace() { - self.eat(); - } else { - break; - } - } - - let start = self.index; - let mut backticks = 0u32; + let start = self.index; - while backticks < 3 { - match self.eat() { - Some('`') => backticks += 1, - // Escaping of triple backticks. - Some('\\') if backticks == 1 && self.peek() == Some('`') => { - backticks = 0; - } - Some(_) => {} - None => break, - } + let mut found = 0; + while found < backticks { + match self.eat() { + Some('`') => found += 1, + Some(_) => found = 0, + None => break, } + } - let terminated = backticks == 3; - let end = self.index - if terminated { 3 } else { 0 }; + let terminated = found == backticks; + let end = self.index - if terminated { found } else { 0 }; - Code { - lang, - raw: &self.src[start .. end], - terminated, - } - } else { - Raw { raw, terminated } + Raw { + raw: &self.src[start .. end], + backticks, + terminated, } } @@ -469,18 +434,8 @@ mod tests { fn Str(string: &str, terminated: bool) -> Token { Token::Str { string, terminated } } - fn Raw(raw: &str, terminated: bool) -> Token { - Token::Raw { raw, terminated } - } - fn Code<'a>( - lang: Option<Spanned<&'a str>>, - raw: &'a str, - terminated: bool, - ) -> Token<'a> { - Token::Code { lang, raw, terminated } - } - fn Lang<'a, T: Into<Spanned<&'a str>>>(lang: T) -> Option<Spanned<&'a str>> { - Some(Into::<Spanned<&str>>::into(lang)) + fn Raw(raw: &str, backticks: usize, terminated: bool) -> Token { + Token::Raw { raw, backticks, terminated } } fn UE(sequence: &str, terminated: bool) -> Token { Token::UnicodeEscape { sequence, terminated } @@ -535,21 +490,33 @@ mod tests { t!(Body, "***" => Star, Star, Star); t!(Body, "[func]*bold*" => L, T("func"), R, Star, T("bold"), Star); t!(Body, "hi_you_ there" => T("hi"), Underscore, T("you"), Underscore, S(0), T("there")); - t!(Body, "`raw`" => Raw("raw", true)); t!(Body, "# hi" => Hashtag, S(0), T("hi")); t!(Body, "#()" => Hashtag, T("()")); - t!(Body, "`[func]`" => Raw("[func]", true)); - t!(Body, "`]" => Raw("]", false)); - t!(Body, "\\ " => Backslash, S(0)); - t!(Body, "`\\``" => Raw("\\`", true)); - t!(Body, "``not code`" => Raw("", true), T("not"), S(0), T("code"), Raw("", false)); - t!(Body, "```rust hi```" => Code(Lang("rust"), "hi", true)); - t!(Body, "``` hi`\\``" => Code(None, "hi`\\``", false)); - t!(Body, "```js \r\n document.write(\"go\")" => Code(Lang("js"), " document.write(\"go\")", false)); t!(Header, "_`" => Invalid("_`")); } #[test] + fn test_tokenize_raw() { + // Basics. + t!(Body, "`raw`" => Raw("raw", 1, true)); + t!(Body, "`[func]`" => Raw("[func]", 1, true)); + t!(Body, "`]" => Raw("]", 1, false)); + t!(Body, r"`\`` " => Raw(r"\", 1, true), Raw(" ", 1, false)); + + // Language tag. + t!(Body, "``` hi```" => Raw(" hi", 3, true)); + t!(Body, "```rust hi```" => Raw("rust hi", 3, true)); + t!(Body, r"``` hi\````" => Raw(r" hi\", 3, true), Raw("", 1, false)); + t!(Body, "``` not `y`e`t finished```" => Raw(" not `y`e`t finished", 3, true)); + t!(Body, "```js \r\n document.write(\"go\")`" + => Raw("js \r\n document.write(\"go\")`", 3, false)); + + // More backticks. + t!(Body, "`````` ``````hi" => Raw(" ", 6, true), T("hi")); + t!(Body, "````\n```js\nalert()\n```\n````" => Raw("\n```js\nalert()\n```\n", 4, true)); + } + + #[test] fn tokenize_header_only_tokens() { t!(Body, "a: b" => T("a:"), S(0), T("b")); t!(Body, "c=d, " => T("c=d,"), S(0)); |
