New syntax features 👔

- Forced line breaks with backslash followed by whitespace - (Multline) raw text in backticks - Set font class fallbacks with [font.family] (e.g. [font.family: monospace=("CMU Typewriter Text")]) - More sophisticated procedure to find end of function, which accounts for comments, strings, raw text and nested functions (this is a mix of a feature and a bug fix)
author: Laurenz <laurmaedje@gmail.com> 2020-02-13 21:58:49 +0100
committer: Laurenz <laurmaedje@gmail.com> 2020-02-13 21:58:49 +0100
commit: 1658b00282b631fb5218f477ea7f45f925644cea (patch)
tree: 13a0f51f335e687c363a5afb9cc73993a69489cc /src/syntax/tokens.rs
parent: 60099aed50b89daef29543c4700470e566c48798 (diff)
1 files changed, 107 insertions, 42 deletions
diff --git a/src/syntax/tokens.rs b/src/syntax/tokens.rs
index d78938e3..cc65d993 100644
--- a/src/syntax/tokens.rs
+++ b/src/syntax/tokens.rs
@@ -83,8 +83,17 @@ pub enum Token<'s> {
     Star,
     /// An underscore in body-text.
     Underscore,
-    /// A backtick in body-text.
-    Backtick,
+
+    /// A backslash followed by whitespace in text.
+    Backslash,
+
+    /// Raw text.
+    Raw {
+        /// The raw text (not yet unescaped as for strings).
+        raw: &'s str,
+        /// Whether the closing backtick was present.
+        terminated: bool,
+    },
 
     /// Any other consecutive string.
     Text(&'s str),
@@ -115,8 +124,9 @@ impl<'s> Token<'s> {
             ExprBool(_)     => "bool",
             Star            => "star",
             Underscore      => "underscore",
-            Backtick        => "backtick",
-            Text(_)         => "invalid identifier",
+            Backslash       => "backslash",
+            Raw { .. }      => "raw text",
+            Text(_)         => "text",
             Invalid("]")    => "closing bracket",
             Invalid("*/")   => "end of block comment",
             Invalid(_)      => "invalid token",
@@ -206,7 +216,7 @@ impl<'s> Iterator for Tokens<'s> {
             // Style toggles.
             '*' if self.mode == Body => Star,
             '_' if self.mode == Body => Underscore,
-            '`' if self.mode == Body => Backtick,
+            '`' if self.mode == Body => self.parse_raw(),
 
             // An escaped thing.
             '\\' if self.mode == Body => self.parse_escaped(),
@@ -281,7 +291,7 @@ impl<'s> Tokens<'s> {
     }
 
     fn parse_function(&mut self, start: Position) -> Token<'s> {
-        let (header, terminated) = self.read_function_part();
+        let (header, terminated) = self.read_function_part(Header);
         self.eat();
 
         if self.peek() != Some('[') {
@@ -291,7 +301,7 @@ impl<'s> Tokens<'s> {
         self.eat();
 
         let body_start = self.pos() - start;
-        let (body, terminated) = self.read_function_part();
+        let (body, terminated) = self.read_function_part(Body);
         let body_end = self.pos() - start;
         let span = Span::new(body_start, body_end);
 
@@ -300,60 +310,73 @@ impl<'s> Tokens<'s> {
         Function { header, body: Some(Spanned { v: body, span }), terminated }
     }
 
-    fn read_function_part(&mut self) -> (&'s str, bool) {
-        let mut escaped = false;
-        let mut in_string = false;
-        let mut depth = 0;
+    fn read_function_part(&mut self, mode: TokenizationMode) -> (&'s str, bool) {
+        let start = self.index();
+        let mut terminated = false;
 
-        self.read_string_until(|n| {
+        while let Some(n) = self.peek() {
+            if n == ']' {
+                terminated = true;
+                break;
+            }
+
+            self.eat();
             match n {
-                '"' if !escaped => in_string = !in_string,
-                '[' if !escaped && !in_string => depth += 1,
-                ']' if !escaped && !in_string => {
-                    if depth == 0 {
-                        return true;
-                    } else {
-                        depth -= 1;
-                    }
-                }
-                '\\' => escaped = !escaped,
-                _ => escaped = false,
+                '[' => { self.parse_function(Position::ZERO); }
+                '/' if self.peek() == Some('/') => { self.parse_line_comment(); }
+                '/' if self.peek() == Some('*') => { self.parse_block_comment(); }
+                '"' if mode == Header => { self.parse_string(); }
+                '`' if mode == Body => { self.parse_raw(); }
+                '\\' => { self.eat(); }
+                _ => {}
             }
+        }
 
-            false
-        }, false, 0, 0)
+        let end = self.index();
+        (&self.src[start .. end], terminated)
     }
 
     fn parse_string(&mut self) -> Token<'s> {
+        let (string, terminated) = self.read_until_unescaped('"');
+        ExprStr { string, terminated }
+    }
+
+    fn parse_raw(&mut self) -> Token<'s> {
+        let (raw, terminated) = self.read_until_unescaped('`');
+        Raw { raw, terminated }
+    }
+
+    fn read_until_unescaped(&mut self, c: char) -> (&'s str, bool) {
         let mut escaped = false;
-        let (string, terminated) = self.read_string_until(|n| {
+        self.read_string_until(|n| {
             match n {
-                '"' if !escaped => return true,
+                n if n == c && !escaped => return true,
                 '\\' => escaped = !escaped,
                 _ => escaped = false,
             }
 
             false
-        }, true, 0, -1);
-        ExprStr { string, terminated }
+        }, true, 0, -1)
     }
 
     fn parse_escaped(&mut self) -> Token<'s> {
         fn is_escapable(c: char) -> bool {
             match c {
-                '[' | ']' | '\\' | '/' | '*' | '_' | '`' => true,
+                '[' | ']' | '\\' | '/' | '*' | '_' | '`' | '"' => true,
                 _ => false,
             }
         }
 
-        Text(match self.peek() {
+        match self.peek() {
             Some(c) if is_escapable(c) => {
                 let index = self.index();
                 self.eat();
-                &self.src[index .. index + c.len_utf8()]
+                Text(&self.src[index .. index + c.len_utf8()])
             }
-            _ => "\\"
-        })
+            Some(c) if c.is_whitespace() => Backslash,
+            Some(_) => Text("\\"),
+            None => Backslash,
+        }
     }
 
     fn parse_expr(&mut self, text: &'s str) -> Token<'s> {
@@ -462,6 +485,7 @@ pub fn is_identifier(string: &str) -> bool {
     true
 }
 
+
 #[cfg(test)]
 mod tests {
     use super::super::test::check;
@@ -483,6 +507,11 @@ mod tests {
         Token::ExprStr { string, terminated }
     }
 
+    #[allow(non_snake_case)]
+    fn Raw(raw: &'static str, terminated: bool) -> Token<'static> {
+        Token::Raw { raw, terminated }
+    }
+
     /// Test whether the given string tokenizes into the given list of tokens.
     macro_rules! t {
         ($mode:expr, $source:expr => [$($tokens:tt)*]) => {
@@ -540,10 +569,15 @@ mod tests {
 
     #[test]
     fn tokenize_body_only_tokens() {
-        t!(Body, "_*`"           => [Underscore, Star, Backtick]);
+        t!(Body, "_*"            => [Underscore, Star]);
         t!(Body, "***"           => [Star, Star, Star]);
         t!(Body, "[func]*bold*"  => [func!("func", None, true), Star, T("bold"), Star]);
         t!(Body, "hi_you_ there" => [T("hi"), Underscore, T("you"), Underscore, S(0), T("there")]);
+        t!(Body, "`raw`"         => [Raw("raw", true)]);
+        t!(Body, "`[func]`"      => [Raw("[func]", true)]);
+        t!(Body, "`]"            => [Raw("]", false)]);
+        t!(Body, "`\\``"         => [Raw("\\`", true)]);
+        t!(Body, "\\ "           => [Backslash, S(0)]);
         t!(Header, "_*`"         => [Invalid("_*`")]);
     }
 
@@ -599,14 +633,45 @@ mod tests {
     }
 
     #[test]
+    fn tokenize_correct_end_of_function() {
+        // End of function with strings and carets in headers
+        t!(Body, r#"[f: "]"#      => [func!(r#"f: "]"#, None, false)]);
+        t!(Body, "[f: \"s\"]"     => [func!("f: \"s\"", None, true)]);
+        t!(Body, r#"[f: \"\"\"]"# => [func!(r#"f: \"\"\""#, None, true)]);
+        t!(Body, "[f: `]"         => [func!("f: `", None, true)]);
+
+        // End of function with strings and carets in bodies
+        t!(Body, "[f][\"]"        => [func!("f", Some((0:4, 0:5, "\"")), true)]);
+        t!(Body, r#"[f][\"]"#     => [func!("f", Some((0:4, 0:6, r#"\""#)), true)]);
+        t!(Body, "[f][`]"         => [func!("f", Some((0:4, 0:6, "`]")), false)]);
+        t!(Body, "[f][\\`]"       => [func!("f", Some((0:4, 0:6, "\\`")), true)]);
+        t!(Body, "[f][`raw`]"     => [func!("f", Some((0:4, 0:9, "`raw`")), true)]);
+        t!(Body, "[f][`raw]"      => [func!("f", Some((0:4, 0:9, "`raw]")), false)]);
+        t!(Body, "[f][`raw]`]"    => [func!("f", Some((0:4, 0:10, "`raw]`")), true)]);
+        t!(Body, "[f][`\\`]"      => [func!("f", Some((0:4, 0:8, "`\\`]")), false)]);
+        t!(Body, "[f][`\\\\`]"    => [func!("f", Some((0:4, 0:8, "`\\\\`")), true)]);
+
+        // End of function with comments
+        t!(Body, "[f][/*]"        => [func!("f", Some((0:4, 0:7, "/*]")), false)]);
+        t!(Body, "[f][/*`*/]"     => [func!("f", Some((0:4, 0:9, "/*`*/")), true)]);
+        t!(Body, "[f: //]\n]"     => [func!("f: //]\n", None, true)]);
+        t!(Body, "[f: \"//]\n]"   => [func!("f: \"//]\n]", None, false)]);
+
+        // End of function with escaped brackets
+        t!(Body, "[f][\\]]"       => [func!("f", Some((0:4, 0:6, "\\]")), true)]);
+        t!(Body, "[f][\\[]"       => [func!("f", Some((0:4, 0:6, "\\[")), true)]);
+    }
+
+    #[test]
     fn tokenize_escaped_symbols() {
-        t!(Body, r"\\" => [T(r"\")]);
-        t!(Body, r"\[" => [T("[")]);
-        t!(Body, r"\]" => [T("]")]);
-        t!(Body, r"\*" => [T("*")]);
-        t!(Body, r"\_" => [T("_")]);
-        t!(Body, r"\`" => [T("`")]);
-        t!(Body, r"\/" => [T("/")]);
+        t!(Body, r"\\"   => [T(r"\")]);
+        t!(Body, r"\["   => [T("[")]);
+        t!(Body, r"\]"   => [T("]")]);
+        t!(Body, r"\*"   => [T("*")]);
+        t!(Body, r"\_"   => [T("_")]);
+        t!(Body, r"\`"   => [T("`")]);
+        t!(Body, r"\/"   => [T("/")]);
+        t!(Body, r#"\""# => [T("\"")]);
     }
 
     #[test]
author	Laurenz <laurmaedje@gmail.com>	2020-02-13 21:58:49 +0100
committer	Laurenz <laurmaedje@gmail.com>	2020-02-13 21:58:49 +0100
commit	1658b00282b631fb5218f477ea7f45f925644cea (patch)
tree	13a0f51f335e687c363a5afb9cc73993a69489cc /src/syntax/tokens.rs
parent	60099aed50b89daef29543c4700470e566c48798 (diff)