More spans in AST

author: Laurenz <laurmaedje@gmail.com> 2022-10-17 16:47:07 +0200
committer: Laurenz <laurmaedje@gmail.com> 2022-10-17 17:11:01 +0200
commit: 4fd031a256b2ecfe524859d5599fafb386395572 (patch)
tree: 14787137b5188666a2133525d10ac0b72357551c /src/parse/tokens.rs
parent: 54b38c479060ac06213cb311f22b84bccdf88932 (diff)
1 files changed, 91 insertions, 88 deletions
diff --git a/src/parse/tokens.rs b/src/parse/tokens.rs
index 7cba1823..73c64d1e 100644
--- a/src/parse/tokens.rs
+++ b/src/parse/tokens.rs
@@ -4,10 +4,8 @@ use unicode_xid::UnicodeXID;
 use unscanny::Scanner;
 
 use super::resolve::{resolve_hex, resolve_raw, resolve_string};
-use crate::diag::ErrorPos;
 use crate::geom::{AngleUnit, LengthUnit};
-use crate::syntax::ast::{RawNode, Unit};
-use crate::syntax::NodeKind;
+use crate::syntax::{ErrorPos, NodeKind, RawKind, Unit};
 use crate::util::EcoString;
 
 /// An iterator over the tokens of a string of source code.
@@ -199,14 +197,25 @@ impl<'s> Tokens<'s> {
             '[' => NodeKind::LeftBracket,
             ']' => NodeKind::RightBracket,
 
+            // Multi-char things.
+            '#' => self.hash(start),
+            '.' if self.s.eat_if("..") => NodeKind::Shorthand('\u{2026}'),
+            '-' => self.hyph(),
+            'h' if self.s.eat_if("ttp://") || self.s.eat_if("ttps://") => {
+                self.link(start)
+            }
+            '`' => self.raw(),
+            c if c.is_ascii_digit() => self.numbering(start),
+            '<' => self.label(),
+            '@' => self.reference(start),
+
             // Escape sequences.
             '\\' => self.backslash(),
 
             // Single-char things.
-            '~' => NodeKind::Tilde,
-            '.' if self.s.eat_if("..") => NodeKind::Dot3,
-            '\'' => NodeKind::Quote { double: false },
-            '"' => NodeKind::Quote { double: true },
+            '~' => NodeKind::Shorthand('\u{00A0}'),
+            '\'' => NodeKind::SmartQuote { double: false },
+            '"' => NodeKind::SmartQuote { double: true },
             '*' if !self.in_word() => NodeKind::Star,
             '_' if !self.in_word() => NodeKind::Underscore,
             '$' => NodeKind::Dollar,
@@ -215,17 +224,6 @@ impl<'s> Tokens<'s> {
             '/' => NodeKind::Slash,
             ':' => NodeKind::Colon,
 
-            // Multi-char things.
-            '#' => self.hash(start),
-            '-' => self.hyph(),
-            'h' if self.s.eat_if("ttp://") || self.s.eat_if("ttps://") => {
-                self.link(start)
-            }
-            '`' => self.raw(),
-            c if c.is_ascii_digit() => self.numbering(start),
-            '<' => self.label(),
-            '@' => self.reference(start),
-
             // Plain text.
             _ => self.text(start),
         }
@@ -291,8 +289,8 @@ impl<'s> Tokens<'s> {
             }
 
             // Linebreaks.
-            Some(c) if c.is_whitespace() => NodeKind::Backslash,
-            None => NodeKind::Backslash,
+            Some(c) if c.is_whitespace() => NodeKind::Linebreak,
+            None => NodeKind::Linebreak,
 
             // Escapes.
             Some(c) => {
@@ -317,24 +315,17 @@ impl<'s> Tokens<'s> {
     fn hyph(&mut self) -> NodeKind {
         if self.s.eat_if('-') {
             if self.s.eat_if('-') {
-                NodeKind::Hyph3
+                NodeKind::Shorthand('\u{2014}')
             } else {
-                NodeKind::Hyph2
+                NodeKind::Shorthand('\u{2013}')
             }
         } else if self.s.eat_if('?') {
-            NodeKind::HyphQuest
+            NodeKind::Shorthand('\u{00AD}')
         } else {
             NodeKind::Minus
         }
     }
 
-    fn in_word(&self) -> bool {
-        let alphanumeric = |c: Option<char>| c.map_or(false, |c| c.is_alphanumeric());
-        let prev = self.s.scout(-2);
-        let next = self.s.peek();
-        alphanumeric(prev) && alphanumeric(next)
-    }
-
     fn link(&mut self, start: usize) -> NodeKind {
         #[rustfmt::skip]
         self.s.eat_while(|c: char| matches!(c,
@@ -360,7 +351,7 @@ impl<'s> Tokens<'s> {
 
         // Special case for empty inline block.
         if backticks == 2 {
-            return NodeKind::Raw(Arc::new(RawNode {
+            return NodeKind::Raw(Arc::new(RawKind {
                 text: EcoString::new(),
                 lang: None,
                 block: false,
@@ -567,22 +558,23 @@ impl<'s> Tokens<'s> {
             }
         }
 
-        if let Ok(f) = number.parse::<f64>() {
-            match suffix {
-                "" => NodeKind::Float(f),
-                "pt" => NodeKind::Numeric(f, Unit::Length(LengthUnit::Pt)),
-                "mm" => NodeKind::Numeric(f, Unit::Length(LengthUnit::Mm)),
-                "cm" => NodeKind::Numeric(f, Unit::Length(LengthUnit::Cm)),
-                "in" => NodeKind::Numeric(f, Unit::Length(LengthUnit::In)),
-                "deg" => NodeKind::Numeric(f, Unit::Angle(AngleUnit::Deg)),
-                "rad" => NodeKind::Numeric(f, Unit::Angle(AngleUnit::Rad)),
-                "em" => NodeKind::Numeric(f, Unit::Em),
-                "fr" => NodeKind::Numeric(f, Unit::Fr),
-                "%" => NodeKind::Numeric(f, Unit::Percent),
-                _ => NodeKind::Error(ErrorPos::Full, "invalid number suffix".into()),
-            }
-        } else {
-            NodeKind::Error(ErrorPos::Full, "invalid number".into())
+        let v = match number.parse::<f64>() {
+            Ok(v) => v,
+            Err(_) => return NodeKind::Error(ErrorPos::Full, "invalid number".into()),
+        };
+
+        match suffix {
+            "" => NodeKind::Float(v),
+            "pt" => NodeKind::Numeric(v, Unit::Length(LengthUnit::Pt)),
+            "mm" => NodeKind::Numeric(v, Unit::Length(LengthUnit::Mm)),
+            "cm" => NodeKind::Numeric(v, Unit::Length(LengthUnit::Cm)),
+            "in" => NodeKind::Numeric(v, Unit::Length(LengthUnit::In)),
+            "deg" => NodeKind::Numeric(v, Unit::Angle(AngleUnit::Deg)),
+            "rad" => NodeKind::Numeric(v, Unit::Angle(AngleUnit::Rad)),
+            "em" => NodeKind::Numeric(v, Unit::Em),
+            "fr" => NodeKind::Numeric(v, Unit::Fr),
+            "%" => NodeKind::Numeric(v, Unit::Percent),
+            _ => NodeKind::Error(ErrorPos::Full, "invalid number suffix".into()),
         }
     }
 
@@ -605,6 +597,13 @@ impl<'s> Tokens<'s> {
             NodeKind::Error(ErrorPos::End, "expected quote".into())
         }
     }
+
+    fn in_word(&self) -> bool {
+        let alphanumeric = |c: Option<char>| c.map_or(false, |c| c.is_alphanumeric());
+        let prev = self.s.scout(-2);
+        let next = self.s.peek();
+        alphanumeric(prev) && alphanumeric(next)
+    }
 }
 
 fn keyword(ident: &str) -> Option<NodeKind> {
@@ -724,7 +723,7 @@ mod tests {
     }
 
     fn Raw(text: &str, lang: Option<&str>, block: bool) -> NodeKind {
-        NodeKind::Raw(Arc::new(RawNode {
+        NodeKind::Raw(Arc::new(RawKind {
             text: text.into(),
             lang: lang.map(Into::into),
             block,
@@ -762,6 +761,43 @@ mod tests {
     /// - '/': symbols
     const BLOCKS: &str = " a1/";
 
+    // Suffixes described by four-tuples of:
+    //
+    // - block the suffix is part of
+    // - mode in which the suffix is applicable
+    // - the suffix string
+    // - the resulting suffix NodeKind
+    fn suffixes()
+    -> impl Iterator<Item = (char, Option<TokenMode>, &'static str, NodeKind)> {
+        [
+            // Whitespace suffixes.
+            (' ', None, " ", Space(0)),
+            (' ', None, "\n", Space(1)),
+            (' ', None, "\r", Space(1)),
+            (' ', None, "\r\n", Space(1)),
+            // Letter suffixes.
+            ('a', Some(Markup), "hello", Text("hello")),
+            ('a', Some(Markup), "💚", Text("💚")),
+            ('a', Some(Code), "val", Ident("val")),
+            ('a', Some(Code), "α", Ident("α")),
+            ('a', Some(Code), "_", Ident("_")),
+            // Number suffixes.
+            ('1', Some(Code), "2", Int(2)),
+            ('1', Some(Code), ".2", Float(0.2)),
+            // Symbol suffixes.
+            ('/', None, "[", LeftBracket),
+            ('/', None, "//", LineComment),
+            ('/', None, "/**/", BlockComment),
+            ('/', Some(Markup), "*", Star),
+            ('/', Some(Markup), r"\\", Escape('\\')),
+            ('/', Some(Markup), "#let", Let),
+            ('/', Some(Code), "(", LeftParen),
+            ('/', Some(Code), ":", Colon),
+            ('/', Some(Code), "+=", PlusEq),
+        ]
+        .into_iter()
+    }
+
     macro_rules! t {
         (Both $($tts:tt)*) => {
             t!(Markup $($tts)*);
@@ -771,41 +807,8 @@ mod tests {
             // Test without suffix.
             t!(@$mode: $text => $($token),*);
 
-            // Suffixes described by four-tuples of:
-            //
-            // - block the suffix is part of
-            // - mode in which the suffix is applicable
-            // - the suffix string
-            // - the resulting suffix NodeKind
-            let suffixes: &[(char, Option<TokenMode>, &str, NodeKind)] = &[
-                // Whitespace suffixes.
-                (' ', None, " ", Space(0)),
-                (' ', None, "\n", Space(1)),
-                (' ', None, "\r", Space(1)),
-                (' ', None, "\r\n", Space(1)),
-                // Letter suffixes.
-                ('a', Some(Markup), "hello", Text("hello")),
-                ('a', Some(Markup), "💚", Text("💚")),
-                ('a', Some(Code), "val", Ident("val")),
-                ('a', Some(Code), "α", Ident("α")),
-                ('a', Some(Code), "_", Ident("_")),
-                // Number suffixes.
-                ('1', Some(Code), "2", Int(2)),
-                ('1', Some(Code), ".2", Float(0.2)),
-                // Symbol suffixes.
-                ('/', None, "[", LeftBracket),
-                ('/', None, "//", LineComment),
-                ('/', None, "/**/", BlockComment),
-                ('/', Some(Markup), "*", Star),
-                ('/', Some(Markup), r"\\", Escape('\\')),
-                ('/', Some(Markup), "#let", Let),
-                ('/', Some(Code), "(", LeftParen),
-                ('/', Some(Code), ":", Colon),
-                ('/', Some(Code), "+=", PlusEq),
-            ];
-
             // Test with each applicable suffix.
-            for &(block, mode, suffix, ref token) in suffixes {
+            for (block, mode, suffix, ref token) in suffixes() {
                 let text = $text;
                 #[allow(unused_variables)]
                 let blocks = BLOCKS;
@@ -872,14 +875,14 @@ mod tests {
         t!(Markup[" /"]: "reha-world" => Text("reha-world"));
 
         // Test code symbols in text.
-        t!(Markup[" /"]: "a():\"b" => Text("a()"), Colon, Quote { double: true }, Text("b"));
+        t!(Markup[" /"]: "a():\"b" => Text("a()"), Colon, SmartQuote { double: true }, Text("b"));
         t!(Markup[" /"]: ";,|/+"  => Text(";,|/+"));
         t!(Markup[" /"]: "=-a"     => Eq, Minus, Text("a"));
         t!(Markup[" "]: "#123"     => Text("#123"));
 
         // Test text ends.
         t!(Markup[""]: "hello " => Text("hello"), Space(0));
-        t!(Markup[""]: "hello~" => Text("hello"), Tilde);
+        t!(Markup[""]: "hello~" => Text("hello"), Shorthand('\u{00A0}'));
     }
 
     #[test]
@@ -924,10 +927,10 @@ mod tests {
         t!(Markup: "_"          => Underscore);
         t!(Markup[""]: "==="    => Eq, Eq, Eq);
         t!(Markup["a1/"]: "= "  => Eq, Space(0));
-        t!(Markup[" "]: r"\"    => Backslash);
-        t!(Markup: "~"          => Tilde);
-        t!(Markup["a1/"]: "-?"  => HyphQuest);
-        t!(Markup["a "]: r"a--" => Text("a"), Hyph2);
+        t!(Markup[" "]: r"\"    => Linebreak);
+        t!(Markup: "~"          => Shorthand('\u{00A0}'));
+        t!(Markup["a1/"]: "-?"  => Shorthand('\u{00AD}'));
+        t!(Markup["a "]: r"a--" => Text("a"), Shorthand('\u{2013}'));
         t!(Markup["a1/"]: "- "  => Minus, Space(0));
         t!(Markup[" "]: "+"     => Plus);
         t!(Markup[" "]: "1."    => EnumNumbering(1));
author	Laurenz <laurmaedje@gmail.com>	2022-10-17 16:47:07 +0200
committer	Laurenz <laurmaedje@gmail.com>	2022-10-17 17:11:01 +0200
commit	4fd031a256b2ecfe524859d5599fafb386395572 (patch)
tree	14787137b5188666a2133525d10ac0b72357551c /src/parse/tokens.rs
parent	54b38c479060ac06213cb311f22b84bccdf88932 (diff)