diff options
| author | Laurenz <laurmaedje@gmail.com> | 2022-10-17 16:47:07 +0200 |
|---|---|---|
| committer | Laurenz <laurmaedje@gmail.com> | 2022-10-17 17:11:01 +0200 |
| commit | 4fd031a256b2ecfe524859d5599fafb386395572 (patch) | |
| tree | 14787137b5188666a2133525d10ac0b72357551c /src/parse/tokens.rs | |
| parent | 54b38c479060ac06213cb311f22b84bccdf88932 (diff) | |
More spans in AST
Diffstat (limited to 'src/parse/tokens.rs')
| -rw-r--r-- | src/parse/tokens.rs | 179 |
1 files changed, 91 insertions, 88 deletions
diff --git a/src/parse/tokens.rs b/src/parse/tokens.rs index 7cba1823..73c64d1e 100644 --- a/src/parse/tokens.rs +++ b/src/parse/tokens.rs @@ -4,10 +4,8 @@ use unicode_xid::UnicodeXID; use unscanny::Scanner; use super::resolve::{resolve_hex, resolve_raw, resolve_string}; -use crate::diag::ErrorPos; use crate::geom::{AngleUnit, LengthUnit}; -use crate::syntax::ast::{RawNode, Unit}; -use crate::syntax::NodeKind; +use crate::syntax::{ErrorPos, NodeKind, RawKind, Unit}; use crate::util::EcoString; /// An iterator over the tokens of a string of source code. @@ -199,14 +197,25 @@ impl<'s> Tokens<'s> { '[' => NodeKind::LeftBracket, ']' => NodeKind::RightBracket, + // Multi-char things. + '#' => self.hash(start), + '.' if self.s.eat_if("..") => NodeKind::Shorthand('\u{2026}'), + '-' => self.hyph(), + 'h' if self.s.eat_if("ttp://") || self.s.eat_if("ttps://") => { + self.link(start) + } + '`' => self.raw(), + c if c.is_ascii_digit() => self.numbering(start), + '<' => self.label(), + '@' => self.reference(start), + // Escape sequences. '\\' => self.backslash(), // Single-char things. - '~' => NodeKind::Tilde, - '.' if self.s.eat_if("..") => NodeKind::Dot3, - '\'' => NodeKind::Quote { double: false }, - '"' => NodeKind::Quote { double: true }, + '~' => NodeKind::Shorthand('\u{00A0}'), + '\'' => NodeKind::SmartQuote { double: false }, + '"' => NodeKind::SmartQuote { double: true }, '*' if !self.in_word() => NodeKind::Star, '_' if !self.in_word() => NodeKind::Underscore, '$' => NodeKind::Dollar, @@ -215,17 +224,6 @@ impl<'s> Tokens<'s> { '/' => NodeKind::Slash, ':' => NodeKind::Colon, - // Multi-char things. - '#' => self.hash(start), - '-' => self.hyph(), - 'h' if self.s.eat_if("ttp://") || self.s.eat_if("ttps://") => { - self.link(start) - } - '`' => self.raw(), - c if c.is_ascii_digit() => self.numbering(start), - '<' => self.label(), - '@' => self.reference(start), - // Plain text. _ => self.text(start), } @@ -291,8 +289,8 @@ impl<'s> Tokens<'s> { } // Linebreaks. - Some(c) if c.is_whitespace() => NodeKind::Backslash, - None => NodeKind::Backslash, + Some(c) if c.is_whitespace() => NodeKind::Linebreak, + None => NodeKind::Linebreak, // Escapes. Some(c) => { @@ -317,24 +315,17 @@ impl<'s> Tokens<'s> { fn hyph(&mut self) -> NodeKind { if self.s.eat_if('-') { if self.s.eat_if('-') { - NodeKind::Hyph3 + NodeKind::Shorthand('\u{2014}') } else { - NodeKind::Hyph2 + NodeKind::Shorthand('\u{2013}') } } else if self.s.eat_if('?') { - NodeKind::HyphQuest + NodeKind::Shorthand('\u{00AD}') } else { NodeKind::Minus } } - fn in_word(&self) -> bool { - let alphanumeric = |c: Option<char>| c.map_or(false, |c| c.is_alphanumeric()); - let prev = self.s.scout(-2); - let next = self.s.peek(); - alphanumeric(prev) && alphanumeric(next) - } - fn link(&mut self, start: usize) -> NodeKind { #[rustfmt::skip] self.s.eat_while(|c: char| matches!(c, @@ -360,7 +351,7 @@ impl<'s> Tokens<'s> { // Special case for empty inline block. if backticks == 2 { - return NodeKind::Raw(Arc::new(RawNode { + return NodeKind::Raw(Arc::new(RawKind { text: EcoString::new(), lang: None, block: false, @@ -567,22 +558,23 @@ impl<'s> Tokens<'s> { } } - if let Ok(f) = number.parse::<f64>() { - match suffix { - "" => NodeKind::Float(f), - "pt" => NodeKind::Numeric(f, Unit::Length(LengthUnit::Pt)), - "mm" => NodeKind::Numeric(f, Unit::Length(LengthUnit::Mm)), - "cm" => NodeKind::Numeric(f, Unit::Length(LengthUnit::Cm)), - "in" => NodeKind::Numeric(f, Unit::Length(LengthUnit::In)), - "deg" => NodeKind::Numeric(f, Unit::Angle(AngleUnit::Deg)), - "rad" => NodeKind::Numeric(f, Unit::Angle(AngleUnit::Rad)), - "em" => NodeKind::Numeric(f, Unit::Em), - "fr" => NodeKind::Numeric(f, Unit::Fr), - "%" => NodeKind::Numeric(f, Unit::Percent), - _ => NodeKind::Error(ErrorPos::Full, "invalid number suffix".into()), - } - } else { - NodeKind::Error(ErrorPos::Full, "invalid number".into()) + let v = match number.parse::<f64>() { + Ok(v) => v, + Err(_) => return NodeKind::Error(ErrorPos::Full, "invalid number".into()), + }; + + match suffix { + "" => NodeKind::Float(v), + "pt" => NodeKind::Numeric(v, Unit::Length(LengthUnit::Pt)), + "mm" => NodeKind::Numeric(v, Unit::Length(LengthUnit::Mm)), + "cm" => NodeKind::Numeric(v, Unit::Length(LengthUnit::Cm)), + "in" => NodeKind::Numeric(v, Unit::Length(LengthUnit::In)), + "deg" => NodeKind::Numeric(v, Unit::Angle(AngleUnit::Deg)), + "rad" => NodeKind::Numeric(v, Unit::Angle(AngleUnit::Rad)), + "em" => NodeKind::Numeric(v, Unit::Em), + "fr" => NodeKind::Numeric(v, Unit::Fr), + "%" => NodeKind::Numeric(v, Unit::Percent), + _ => NodeKind::Error(ErrorPos::Full, "invalid number suffix".into()), } } @@ -605,6 +597,13 @@ impl<'s> Tokens<'s> { NodeKind::Error(ErrorPos::End, "expected quote".into()) } } + + fn in_word(&self) -> bool { + let alphanumeric = |c: Option<char>| c.map_or(false, |c| c.is_alphanumeric()); + let prev = self.s.scout(-2); + let next = self.s.peek(); + alphanumeric(prev) && alphanumeric(next) + } } fn keyword(ident: &str) -> Option<NodeKind> { @@ -724,7 +723,7 @@ mod tests { } fn Raw(text: &str, lang: Option<&str>, block: bool) -> NodeKind { - NodeKind::Raw(Arc::new(RawNode { + NodeKind::Raw(Arc::new(RawKind { text: text.into(), lang: lang.map(Into::into), block, @@ -762,6 +761,43 @@ mod tests { /// - '/': symbols const BLOCKS: &str = " a1/"; + // Suffixes described by four-tuples of: + // + // - block the suffix is part of + // - mode in which the suffix is applicable + // - the suffix string + // - the resulting suffix NodeKind + fn suffixes() + -> impl Iterator<Item = (char, Option<TokenMode>, &'static str, NodeKind)> { + [ + // Whitespace suffixes. + (' ', None, " ", Space(0)), + (' ', None, "\n", Space(1)), + (' ', None, "\r", Space(1)), + (' ', None, "\r\n", Space(1)), + // Letter suffixes. + ('a', Some(Markup), "hello", Text("hello")), + ('a', Some(Markup), "💚", Text("💚")), + ('a', Some(Code), "val", Ident("val")), + ('a', Some(Code), "α", Ident("α")), + ('a', Some(Code), "_", Ident("_")), + // Number suffixes. + ('1', Some(Code), "2", Int(2)), + ('1', Some(Code), ".2", Float(0.2)), + // Symbol suffixes. + ('/', None, "[", LeftBracket), + ('/', None, "//", LineComment), + ('/', None, "/**/", BlockComment), + ('/', Some(Markup), "*", Star), + ('/', Some(Markup), r"\\", Escape('\\')), + ('/', Some(Markup), "#let", Let), + ('/', Some(Code), "(", LeftParen), + ('/', Some(Code), ":", Colon), + ('/', Some(Code), "+=", PlusEq), + ] + .into_iter() + } + macro_rules! t { (Both $($tts:tt)*) => { t!(Markup $($tts)*); @@ -771,41 +807,8 @@ mod tests { // Test without suffix. t!(@$mode: $text => $($token),*); - // Suffixes described by four-tuples of: - // - // - block the suffix is part of - // - mode in which the suffix is applicable - // - the suffix string - // - the resulting suffix NodeKind - let suffixes: &[(char, Option<TokenMode>, &str, NodeKind)] = &[ - // Whitespace suffixes. - (' ', None, " ", Space(0)), - (' ', None, "\n", Space(1)), - (' ', None, "\r", Space(1)), - (' ', None, "\r\n", Space(1)), - // Letter suffixes. - ('a', Some(Markup), "hello", Text("hello")), - ('a', Some(Markup), "💚", Text("💚")), - ('a', Some(Code), "val", Ident("val")), - ('a', Some(Code), "α", Ident("α")), - ('a', Some(Code), "_", Ident("_")), - // Number suffixes. - ('1', Some(Code), "2", Int(2)), - ('1', Some(Code), ".2", Float(0.2)), - // Symbol suffixes. - ('/', None, "[", LeftBracket), - ('/', None, "//", LineComment), - ('/', None, "/**/", BlockComment), - ('/', Some(Markup), "*", Star), - ('/', Some(Markup), r"\\", Escape('\\')), - ('/', Some(Markup), "#let", Let), - ('/', Some(Code), "(", LeftParen), - ('/', Some(Code), ":", Colon), - ('/', Some(Code), "+=", PlusEq), - ]; - // Test with each applicable suffix. - for &(block, mode, suffix, ref token) in suffixes { + for (block, mode, suffix, ref token) in suffixes() { let text = $text; #[allow(unused_variables)] let blocks = BLOCKS; @@ -872,14 +875,14 @@ mod tests { t!(Markup[" /"]: "reha-world" => Text("reha-world")); // Test code symbols in text. - t!(Markup[" /"]: "a():\"b" => Text("a()"), Colon, Quote { double: true }, Text("b")); + t!(Markup[" /"]: "a():\"b" => Text("a()"), Colon, SmartQuote { double: true }, Text("b")); t!(Markup[" /"]: ";,|/+" => Text(";,|/+")); t!(Markup[" /"]: "=-a" => Eq, Minus, Text("a")); t!(Markup[" "]: "#123" => Text("#123")); // Test text ends. t!(Markup[""]: "hello " => Text("hello"), Space(0)); - t!(Markup[""]: "hello~" => Text("hello"), Tilde); + t!(Markup[""]: "hello~" => Text("hello"), Shorthand('\u{00A0}')); } #[test] @@ -924,10 +927,10 @@ mod tests { t!(Markup: "_" => Underscore); t!(Markup[""]: "===" => Eq, Eq, Eq); t!(Markup["a1/"]: "= " => Eq, Space(0)); - t!(Markup[" "]: r"\" => Backslash); - t!(Markup: "~" => Tilde); - t!(Markup["a1/"]: "-?" => HyphQuest); - t!(Markup["a "]: r"a--" => Text("a"), Hyph2); + t!(Markup[" "]: r"\" => Linebreak); + t!(Markup: "~" => Shorthand('\u{00A0}')); + t!(Markup["a1/"]: "-?" => Shorthand('\u{00AD}')); + t!(Markup["a "]: r"a--" => Text("a"), Shorthand('\u{2013}')); t!(Markup["a1/"]: "- " => Minus, Space(0)); t!(Markup[" "]: "+" => Plus); t!(Markup[" "]: "1." => EnumNumbering(1)); |
