summaryrefslogtreecommitdiff
path: root/src/parse
diff options
context:
space:
mode:
author Laurenz <laurmaedje@gmail.com> 2020-09-30 18:59:33 +0200
committer Laurenz <laurmaedje@gmail.com> 2020-09-30 18:59:33 +0200
commit 4077a7c11ea19b1b6b6b6fe3014b9018846cf21b (patch)
tree 70e4c891c2c660b4136890cebbae7c375fe36c05 /src/parse
parent 7cc279f7ae122f4c40592004dde89792c636b3c8 (diff)
Refactor raw blocks 💱
Diffstat (limited to 'src/parse')
-rw-r--r-- src/parse/escaping.rs 198
-rw-r--r-- src/parse/parser.rs 35
-rw-r--r-- src/parse/tests.rs 60
-rw-r--r-- src/parse/tokenizer.rs 121
4 files changed, 173 insertions, 241 deletions
diff --git a/src/parse/escaping.rs b/src/parse/escaping.rs
index 55b1fe67..a2ff963b 100644
--- a/src/parse/escaping.rs
+++ b/src/parse/escaping.rs
@@ -1,4 +1,5 @@
use super::is_newline_char;
+use crate::syntax::{Ident, Raw};
/// Resolves all escape sequences in a string.
pub fn unescape_string(string: &str) -> String {
@@ -56,101 +57,60 @@ pub fn unescape_string(string: &str) -> String {
out
}
-/// Resolves all escape sequences in raw markup (between backticks) and splits it into
-/// into lines.
-pub fn unescape_raw(raw: &str) -> Vec<String> {
+/// Resolves the language tag and trims the raw text.
+///
+/// Returns a `Raw` whose fields are:
+/// - `lang`: the language tag, if `split_after_lang_tag` found one
+/// - `lines`: the trimmed raw lines
+/// - `inline`: `true` unless `trim_and_split_raw` reported a newline.
+pub fn process_raw(raw: &str) -> Raw {
+ let (lang, inner) = split_after_lang_tag(raw);
+ let (lines, had_newline) = trim_and_split_raw(inner);
+ Raw { lang, lines, inline: !had_newline }
+}
+
+/// Parse the lang tag and return it alongside the remaining inner raw text.
+fn split_after_lang_tag(raw: &str) -> (Option<Ident>, &str) {
+ let mut lang = String::new();
+
+ let mut inner = raw;
let mut iter = raw.chars();
- let mut text = String::new();
while let Some(c) = iter.next() {
- if c == '\\' {
- if let Some(c) = iter.next() {
- if c != '\\' && c != '`' {
- text.push('\\');
- }
-
- text.push(c);
- } else {
- text.push('\\');
- }
- } else {
- text.push(c);
+ if c == '`' || c.is_whitespace() || is_newline_char(c) {
+ break;
}
+
+ inner = iter.as_str();
+ lang.push(c);
}
- split_lines(&text)
+ (Ident::new(lang), inner)
}
-/// Resolves all escape sequences in code markup (between triple backticks) and splits it
-/// into into lines.
-pub fn unescape_code(raw: &str) -> Vec<String> {
- let mut iter = raw.chars().peekable();
- let mut text = String::new();
- let mut backticks = 0u32;
- let mut update_backtick_count;
-
- while let Some(c) = iter.next() {
- update_backtick_count = true;
-
- if c == '\\' && backticks > 0 {
- let mut tail = String::new();
- let mut escape_success = false;
- let mut backticks_after_slash = 0u32;
-
- while let Some(&s) = iter.peek() {
- match s {
- '\\' => {
- if backticks_after_slash == 0 {
- tail.push('\\');
- } else {
- // Pattern like `\`\` should fail
- // escape and just be printed verbantim.
- break;
- }
- }
- '`' => {
- tail.push(s);
- backticks_after_slash += 1;
- if backticks_after_slash == 2 {
- escape_success = true;
- iter.next();
- break;
- }
- }
- _ => break,
- }
-
- iter.next();
- }
-
- if !escape_success {
- text.push(c);
- backticks = backticks_after_slash;
- update_backtick_count = false;
- } else {
- backticks = 0;
- }
-
- text.push_str(&tail);
- } else {
- text.push(c);
- }
-
- if update_backtick_count {
- if c == '`' {
- backticks += 1;
- } else {
- backticks = 0;
- }
- }
+/// Trims raw text and splits it into lines.
+///
+/// Returns whether at least one newline was contained in `raw`.
+fn trim_and_split_raw(raw: &str) -> (Vec<String>, bool) {
+ // Trims at most one space (' ') at the start and at the end.
+ let raw = raw.strip_prefix(' ').unwrap_or(raw);
+ let raw = raw.strip_suffix(' ').unwrap_or(raw);
+
+ let mut lines = split_lines(raw);
+ let had_newline = lines.len() > 1;
+ let is_whitespace = |line: &String| line.chars().all(char::is_whitespace);
+
+ // Trims a sequence of whitespace followed by a newline at the start.
+ if lines.first().map(is_whitespace).unwrap_or(false) {
+ lines.remove(0);
}
- split_lines(&text)
-}
+ // Trims a newline followed by a sequence of whitespace at the end.
+ if lines.last().map(is_whitespace).unwrap_or(false) {
+ lines.pop();
+ }
-/// Converts a hexademical sequence (without braces or "\u") into a character.
-pub fn hex_to_char(sequence: &str) -> Option<char> {
- u32::from_str_radix(sequence, 16).ok().and_then(std::char::from_u32)
+ (lines, had_newline)
}
/// Splits a string into a vector of lines (respecting Unicode & Windows line breaks).
@@ -175,12 +135,17 @@ pub fn split_lines(text: &str) -> Vec<String> {
lines
}
+/// Converts a hexadecimal sequence (without braces or "\u") into a character.
+pub fn hex_to_char(sequence: &str) -> Option<char> {
+ u32::from_str_radix(sequence, 16).ok().and_then(std::char::from_u32)
+}
+
#[cfg(test)]
+#[rustfmt::skip]
mod tests {
use super::*;
#[test]
- #[rustfmt::skip]
fn test_unescape_strings() {
fn test(string: &str, expected: &str) {
assert_eq!(unescape_string(string), expected.to_string());
@@ -201,43 +166,48 @@ mod tests {
}
#[test]
- #[rustfmt::skip]
- fn test_unescape_raws() {
+ fn test_split_after_lang_tag() {
+ fn test(raw: &str, lang: Option<&str>, inner: &str) {
+ let (found_lang, found_inner) = split_after_lang_tag(raw);
+ assert_eq!(found_lang.as_ref().map(|id| id.as_str()), lang);
+ assert_eq!(found_inner, inner);
+ }
+
+ test("typst it!", Some("typst"), " it!");
+ test("typst\n it!", Some("typst"), "\n it!");
+ test("typst\n it!", Some("typst"), "\n it!");
+ test("abc`", Some("abc"), "`");
+ test(" hi", None, " hi");
+ test("`", None, "`");
+ }
+
+ #[test]
+ fn test_trim_raw() {
fn test(raw: &str, expected: Vec<&str>) {
- assert_eq!(unescape_raw(raw), expected);
+ assert_eq!(trim_and_split_raw(raw).0, expected);
}
- test("raw\\`", vec!["raw`"]);
- test("raw\\\\`", vec!["raw\\`"]);
- test("raw\ntext", vec!["raw", "text"]);
- test("a\r\nb", vec!["a", "b"]);
- test("a\n\nb", vec!["a", "", "b"]);
- test("a\r\x0Bb", vec!["a", "", "b"]);
- test("a\r\n\r\nb", vec!["a", "", "b"]);
- test("raw\\a", vec!["raw\\a"]);
- test("raw\\", vec!["raw\\"]);
+ test(" hi", vec!["hi"]);
+ test("  hi", vec![" hi"]);
+ test("\nhi", vec!["hi"]);
+ test(" \n hi", vec![" hi"]);
+ test("hi ", vec!["hi"]);
+ test("hi  ", vec!["hi "]);
+ test("hi\n", vec!["hi"]);
+ test("hi \n ", vec!["hi "]);
+ test(" \n hi \n ", vec![" hi "]);
}
#[test]
- #[rustfmt::skip]
- fn test_unescape_code() {
+ fn test_split_lines() {
fn test(raw: &str, expected: Vec<&str>) {
- assert_eq!(unescape_code(raw), expected);
+ assert_eq!(split_lines(raw), expected);
}
- test("code\\`", vec!["code\\`"]);
- test("code`\\``", vec!["code```"]);
- test("code`\\`a", vec!["code`\\`a"]);
- test("code``hi`\\``", vec!["code``hi```"]);
- test("code`\\\\``", vec!["code`\\``"]);
- test("code`\\`\\`go", vec!["code`\\`\\`go"]);
- test("code`\\`\\``", vec!["code`\\```"]);
- test("code\ntext", vec!["code", "text"]);
- test("a\r\nb", vec!["a", "b"]);
- test("a\n\nb", vec!["a", "", "b"]);
- test("a\r\x0Bb", vec!["a", "", "b"]);
- test("a\r\n\r\nb", vec!["a", "", "b"]);
- test("code\\a", vec!["code\\a"]);
- test("code\\", vec!["code\\"]);
+ test("raw\ntext", vec!["raw", "text"]);
+ test("a\r\nb", vec!["a", "b"]);
+ test("a\n\nb", vec!["a", "", "b"]);
+ test("a\r\x0Bb", vec!["a", "", "b"]);
+ test("a\r\n\r\nb", vec!["a", "", "b"]);
}
}
diff --git a/src/parse/parser.rs b/src/parse/parser.rs
index bbd7ee1d..3446af83 100644
--- a/src/parse/parser.rs
+++ b/src/parse/parser.rs
@@ -99,35 +99,22 @@ impl Parser<'_> {
self.parse_heading().map(SyntaxNode::Heading)
}
- Token::Raw { raw, terminated } => {
+ Token::Raw { raw, backticks, terminated } => {
if !terminated {
- error!(@self.feedback, end, "expected backtick");
+ error!(@self.feedback, end, "expected backtick(s)");
}
- self.with_span(SyntaxNode::Raw(unescape_raw(raw)))
- }
- Token::Code { lang, raw, terminated } => {
- if !terminated {
- error!(@self.feedback, end, "expected backticks");
- }
-
- let lang = lang.and_then(|lang| {
- if let Some(ident) = Ident::new(lang.v) {
- Some(ident.span_with(lang.span))
- } else {
- error!(@self.feedback, lang.span, "invalid identifier");
- None
+ let raw = if backticks > 1 {
+ process_raw(raw)
+ } else {
+ Raw {
+ lang: None,
+ lines: split_lines(raw),
+ inline: true,
}
- });
-
- let mut lines = unescape_code(raw);
- let block = lines.len() > 1;
-
- if lines.last().map(|s| s.is_empty()).unwrap_or(false) {
- lines.pop();
- }
+ };
- self.with_span(SyntaxNode::Code(Code { lang, lines, block }))
+ self.with_span(SyntaxNode::Raw(raw))
}
Token::Text(text) => self.with_span(SyntaxNode::Text(text.to_string())),
diff --git a/src/parse/tests.rs b/src/parse/tests.rs
index d663aa2a..8ddf013d 100644
--- a/src/parse/tests.rs
+++ b/src/parse/tests.rs
@@ -29,24 +29,17 @@ macro_rules! H {
}
macro_rules! R {
- ($($line:expr),* $(,)?) => {
- SyntaxNode::Raw(vec![$($line.to_string()),*])
- };
-}
-
-macro_rules! C {
- ($lang:expr, $($line:expr),* $(,)?) => {{
- let lines = vec![$($line.to_string()) ,*];
- SyntaxNode::Code(Code {
+ ($lang:expr, $inline:expr, $($line:expr),* $(,)?) => {{
+ SyntaxNode::Raw(Raw {
lang: $lang,
- block: lines.len() > 1,
- lines,
+ lines: vec![$($line.to_string()) ,*],
+ inline: $inline,
})
}};
}
-fn Lang<'a, T: Into<Spanned<&'a str>>>(lang: T) -> Option<Spanned<Ident>> {
- Some(Into::<Spanned<&str>>::into(lang).map(|s| Ident(s.to_string())))
+fn Lang(lang: &str) -> Option<Ident> {
+ Some(Ident(lang.to_string()))
}
macro_rules! F {
@@ -220,19 +213,7 @@ fn test_parse_simple_nodes() {
t!("\\u{1f303}" => T("🌃"));
t!("\n\n\nhello" => P, T("hello"));
t!(r"a\ b" => T("a"), L, S, T("b"));
- t!("`py`" => R!["py"]);
- t!("`hi\nyou" => R!["hi", "you"]);
- e!("`hi\nyou" => s(7, 7, "expected backtick"));
- t!("`hi\\`du`" => R!["hi`du"]);
-
- ts!("```java out```" => s(0, 14, C![Lang(s(3, 7, "java")), "out"]));
- t!("``` console.log(\n\"alert\"\n)" => C![None, "console.log(", "\"alert\"", ")"]);
- t!("```typst \r\n Typst uses `\\`` to indicate code blocks" => C![
- Lang("typst"), " Typst uses ``` to indicate code blocks"
- ]);
- e!("``` hi\nyou" => s(10, 10, "expected backticks"));
- e!("```🌍 hi\nyou```" => s(3, 7, "invalid identifier"));
e!("\\u{d421c809}" => s(0, 12, "invalid unicode escape sequence"));
e!("\\u{abc" => s(6, 6, "expected closing brace"));
t!("💜\n\n 🌍" => T("💜"), P, T("🌍"));
@@ -243,6 +224,33 @@ fn test_parse_simple_nodes() {
}
#[test]
+fn test_parse_raw() {
+ t!("`py`" => R![None, true, "py"]);
+ t!("`hi\nyou" => R![None, true, "hi", "you"]);
+ t!(r"`` hi\`du``" => R![None, true, r"hi\`du"]);
+
+ // More than one backtick with optional language tag.
+ t!("``` console.log(\n\"alert\"\n)" => R![None, false, "console.log(", "\"alert\"", ")"]);
+ t!("````typst \r\n Typst uses ``` to indicate code blocks````!"
+ => R![Lang("typst"), false, " Typst uses ``` to indicate code blocks"], T("!"));
+
+ // Trimming of whitespace.
+ t!("`` a ``" => R![None, true, "a"]);
+ t!("`` a  ``" => R![None, true, "a "]);
+ t!("`` ` ``" => R![None, true, "`"]);
+ t!("```  `  ```" => R![None, true, " ` "]);
+ t!("```  ` \n ```" => R![None, false, " ` "]);
+
+ // Errors.
+ e!("`hi\nyou" => s(7, 7, "expected backtick(s)"));
+ e!("``` hi\nyou" => s(10, 10, "expected backtick(s)"));
+
+ // TODO: Bring back when spans/errors are in place.
+ // ts!("``java out``" => s(0, 12, R![Lang(s(2, 6, "java")), true, "out"]));
+ // e!("```🌍 hi\nyou```" => s(3, 7, "invalid identifier"));
+}
+
+#[test]
fn test_parse_comments() {
// In body.
t!("hi// you\nw" => T("hi"), S, T("w"));
@@ -348,7 +356,7 @@ fn test_parse_function_bodies() {
e!(" [val][ */]" => s(8, 10, "unexpected end of block comment"));
// Raw in body.
- t!("[val][`Hi]`" => F!("val"; Tree![R!["Hi]"]]));
+ t!("[val][`Hi]`" => F!("val"; Tree![R![None, true, "Hi]"]]));
e!("[val][`Hi]`" => s(11, 11, "expected closing bracket"));
// Crazy.
diff --git a/src/parse/tokenizer.rs b/src/parse/tokenizer.rs
index 92d15edc..720bec43 100644
--- a/src/parse/tokenizer.rs
+++ b/src/parse/tokenizer.rs
@@ -56,7 +56,7 @@ impl<'s> Tokens<'s> {
/// The position in the string at which the last token ends and next token
/// will start.
pub fn pos(&self) -> Pos {
- Pos(self.index as u32)
+ self.index.into()
}
}
@@ -111,7 +111,7 @@ impl<'s> Iterator for Tokens<'s> {
// Style toggles.
'_' if self.mode == Body => Underscore,
- '`' if self.mode == Body => self.read_raw_or_code(),
+ '`' if self.mode == Body => self.read_raw(),
// Sections.
'#' if self.mode == Body => Hashtag,
@@ -230,66 +230,31 @@ impl<'s> Tokens<'s> {
Str { string, terminated }
}
- fn read_raw_or_code(&mut self) -> Token<'s> {
- let (raw, terminated) = self.read_until_unescaped('`');
- if raw.is_empty() && terminated && self.peek() == Some('`') {
- // Third tick found; this is a code block.
+ fn read_raw(&mut self) -> Token<'s> {
+ let mut backticks = 1;
+ while self.peek() == Some('`') {
self.eat();
+ backticks += 1;
+ }
- // Reads the lang tag (until newline or whitespace).
- let start = self.pos();
- let (lang, _) = self.read_string_until(false, 0, 0, |c| {
- c == '`' || c.is_whitespace() || is_newline_char(c)
- });
- let end = self.pos();
-
- let lang = if !lang.is_empty() {
- Some(lang.span_with(Span::new(start, end)))
- } else {
- None
- };
-
- // Skip to start of raw contents.
- while let Some(c) = self.peek() {
- if is_newline_char(c) {
- self.eat();
- if c == '\r' && self.peek() == Some('\n') {
- self.eat();
- }
-
- break;
- } else if c.is_whitespace() {
- self.eat();
- } else {
- break;
- }
- }
-
- let start = self.index;
- let mut backticks = 0u32;
+ let start = self.index;
- while backticks < 3 {
- match self.eat() {
- Some('`') => backticks += 1,
- // Escaping of triple backticks.
- Some('\\') if backticks == 1 && self.peek() == Some('`') => {
- backticks = 0;
- }
- Some(_) => {}
- None => break,
- }
+ let mut found = 0;
+ while found < backticks {
+ match self.eat() {
+ Some('`') => found += 1,
+ Some(_) => found = 0,
+ None => break,
}
+ }
- let terminated = backticks == 3;
- let end = self.index - if terminated { 3 } else { 0 };
+ let terminated = found == backticks;
+ let end = self.index - if terminated { found } else { 0 };
- Code {
- lang,
- raw: &self.src[start .. end],
- terminated,
- }
- } else {
- Raw { raw, terminated }
+ Raw {
+ raw: &self.src[start .. end],
+ backticks,
+ terminated,
}
}
@@ -469,18 +434,8 @@ mod tests {
fn Str(string: &str, terminated: bool) -> Token {
Token::Str { string, terminated }
}
- fn Raw(raw: &str, terminated: bool) -> Token {
- Token::Raw { raw, terminated }
- }
- fn Code<'a>(
- lang: Option<Spanned<&'a str>>,
- raw: &'a str,
- terminated: bool,
- ) -> Token<'a> {
- Token::Code { lang, raw, terminated }
- }
- fn Lang<'a, T: Into<Spanned<&'a str>>>(lang: T) -> Option<Spanned<&'a str>> {
- Some(Into::<Spanned<&str>>::into(lang))
+ fn Raw(raw: &str, backticks: usize, terminated: bool) -> Token {
+ Token::Raw { raw, backticks, terminated }
}
fn UE(sequence: &str, terminated: bool) -> Token {
Token::UnicodeEscape { sequence, terminated }
@@ -535,21 +490,33 @@ mod tests {
t!(Body, "***" => Star, Star, Star);
t!(Body, "[func]*bold*" => L, T("func"), R, Star, T("bold"), Star);
t!(Body, "hi_you_ there" => T("hi"), Underscore, T("you"), Underscore, S(0), T("there"));
- t!(Body, "`raw`" => Raw("raw", true));
t!(Body, "# hi" => Hashtag, S(0), T("hi"));
t!(Body, "#()" => Hashtag, T("()"));
- t!(Body, "`[func]`" => Raw("[func]", true));
- t!(Body, "`]" => Raw("]", false));
- t!(Body, "\\ " => Backslash, S(0));
- t!(Body, "`\\``" => Raw("\\`", true));
- t!(Body, "``not code`" => Raw("", true), T("not"), S(0), T("code"), Raw("", false));
- t!(Body, "```rust hi```" => Code(Lang("rust"), "hi", true));
- t!(Body, "``` hi`\\``" => Code(None, "hi`\\``", false));
- t!(Body, "```js \r\n document.write(\"go\")" => Code(Lang("js"), " document.write(\"go\")", false));
t!(Header, "_`" => Invalid("_`"));
}
#[test]
+ fn test_tokenize_raw() {
+ // Basics.
+ t!(Body, "`raw`" => Raw("raw", 1, true));
+ t!(Body, "`[func]`" => Raw("[func]", 1, true));
+ t!(Body, "`]" => Raw("]", 1, false));
+ t!(Body, r"`\`` " => Raw(r"\", 1, true), Raw(" ", 1, false));
+
+ // Language tag.
+ t!(Body, "``` hi```" => Raw(" hi", 3, true));
+ t!(Body, "```rust hi```" => Raw("rust hi", 3, true));
+ t!(Body, r"``` hi\````" => Raw(r" hi\", 3, true), Raw("", 1, false));
+ t!(Body, "``` not `y`e`t finished```" => Raw(" not `y`e`t finished", 3, true));
+ t!(Body, "```js \r\n document.write(\"go\")`"
+ => Raw("js \r\n document.write(\"go\")`", 3, false));
+
+ // More backticks.
+ t!(Body, "`````` ``````hi" => Raw(" ", 6, true), T("hi"));
+ t!(Body, "````\n```js\nalert()\n```\n````" => Raw("\n```js\nalert()\n```\n", 4, true));
+ }
+
+ #[test]
fn tokenize_header_only_tokens() {
t!(Body, "a: b" => T("a:"), S(0), T("b"));
t!(Body, "c=d, " => T("c=d,"), S(0));