summary refs log tree commit diff
path: root/src/syntax/tokens.rs
diff options
context:
space:
mode:
author Laurenz <laurmaedje@gmail.com> 2020-09-30 12:38:02 +0200
committer Laurenz <laurmaedje@gmail.com> 2020-09-30 12:45:33 +0200
commit bc1b4216a802d09e8d00dd277a0e204d49bcaa7f (patch)
tree 31dabd48d5062fdd684797ed6053bf279ba67490 /src/syntax/tokens.rs
parent fee5170a68a6ef97108d731a4873787894f65a06 (diff)
Reorganize syntax types into two modules 📦
Diffstat (limited to 'src/syntax/tokens.rs')
-rw-r--r-- src/syntax/tokens.rs 786
1 files changed, 0 insertions, 786 deletions
diff --git a/src/syntax/tokens.rs b/src/syntax/tokens.rs
deleted file mode 100644
index 0c37e992..00000000
--- a/src/syntax/tokens.rs
+++ /dev/null
@@ -1,786 +0,0 @@
-//! Tokenization.
-
-use std::iter::Peekable;
-use std::str::Chars;
-use unicode_xid::UnicodeXID;
-
-use super::span::{Pos, Span, Spanned};
-use crate::length::Length;
-
-use Token::*;
-use TokenMode::*;
-/// A minimal semantic entity of source code.
-#[derive(Debug, Copy, Clone, PartialEq)]
-pub enum Token<'s> {
- /// One or more whitespace characters. The contained `usize` denotes the
- /// number of newlines that were contained in the whitespace.
- Space(usize),
-
- /// A line comment with inner string contents `//<str>\n`.
- LineComment(&'s str),
- /// A block comment with inner string contents `/*<str>*/`. The comment
- /// can contain nested block comments.
- BlockComment(&'s str),
-
- /// A left bracket starting a function invocation or body: `[`.
- LeftBracket,
- /// A right bracket ending a function invocation or body: `]`.
- RightBracket,
- /// A left parenthesis in a function header: `(`.
- LeftParen,
- /// A right parenthesis in a function header: `)`.
- RightParen,
- /// A left brace in a function header: `{`.
- LeftBrace,
- /// A right brace in a function header: `}`.
- RightBrace,
- /// A double forward chevron in a function header: `>>`.
- Chain,
-
- /// A colon in a function header: `:`.
- Colon,
- /// A comma in a function header: `,`.
- Comma,
- /// An equals sign in a function header: `=`.
- Equals,
-
- /// An identifier in a function header: `center`.
- Ident(&'s str),
- /// A quoted string in a function header: `"..."`.
- Str {
- /// The string inside the quotes.
- ///
- /// _Note_: If the string contains escape sequences, these are not yet
- /// applied so that a string slice can be stored here instead of a
- /// `String`. The escaping is done later in the parser.
- string: &'s str,
- /// Whether the closing quote was present.
- terminated: bool,
- },
- /// A boolean in a function header: `true | false`.
- Bool(bool),
- /// A number in a function header: `3.14`.
- Number(f64),
- /// A length in a function header: `12pt`.
- Length(Length),
- /// A hex value in a function header: `#20d82a`.
- Hex(&'s str),
- /// A plus in a function header, signifying the addition of expressions.
- Plus,
- /// A hyphen in a function header, signifying the subtraction of
- /// expressions.
- Hyphen,
- /// A slash in a function header, signifying the division of expressions.
- Slash,
-
- /// A star. It can appear in a function header where it signifies the
- /// multiplication of expressions or the body where it modifies the styling.
- Star,
- /// An underscore in body-text.
- Underscore,
- /// A backslash followed by whitespace in text.
- Backslash,
-
- /// A hashtag token in the body can indicate compute mode or headings.
- Hashtag,
-
- /// A unicode escape sequence.
- UnicodeEscape {
- /// The escape sequence between two braces.
- sequence: &'s str,
- /// Whether the closing brace was present.
- terminated: bool,
- },
-
- /// Raw text.
- Raw {
- /// The raw text (not yet unescaped as for strings).
- raw: &'s str,
- /// Whether the closing backtick was present.
- terminated: bool,
- },
-
- /// Multi-line code block.
- Code {
- /// The language of the code block, if specified.
- lang: Option<Spanned<&'s str>>,
- /// The raw text (not yet unescaped as for strings).
- raw: &'s str,
- /// Whether the closing backticks were present.
- terminated: bool,
- },
-
- /// Any other consecutive string.
- Text(&'s str),
-
- /// Things that are not valid in the context they appeared in.
- Invalid(&'s str),
-}
-
-impl<'s> Token<'s> {
- /// The natural-language name for this token for use in error messages.
- pub fn name(self) -> &'static str {
- match self {
- Space(_) => "space",
- LineComment(_) => "line comment",
- BlockComment(_) => "block comment",
- LeftBracket => "opening bracket",
- RightBracket => "closing bracket",
- LeftParen => "opening paren",
- RightParen => "closing paren",
- LeftBrace => "opening brace",
- RightBrace => "closing brace",
- Chain => "function chain operator",
- Colon => "colon",
- Comma => "comma",
- Equals => "equals sign",
- Ident(_) => "identifier",
- Str { .. } => "string",
- Bool(_) => "bool",
- Number(_) => "number",
- Length(_) => "length",
- Hex(_) => "hex value",
- Plus => "plus",
- Hyphen => "minus",
- Slash => "slash",
- Star => "star",
- Underscore => "underscore",
- Backslash => "backslash",
- Hashtag => "hashtag",
- UnicodeEscape { .. } => "unicode escape sequence",
- Raw { .. } => "raw text",
- Code { .. } => "code block",
- Text(_) => "text",
- Invalid("*/") => "end of block comment",
- Invalid(_) => "invalid token",
- }
- }
-}
-
-/// An iterator over the tokens of a string of source code.
-#[derive(Debug)]
-pub struct Tokens<'s> {
- src: &'s str,
- iter: Peekable<Chars<'s>>,
- mode: TokenMode,
- stack: Vec<TokenMode>,
- pos: Pos,
- index: usize,
-}
-
-/// Whether to tokenize in header mode which yields expression, comma and
-/// similar tokens or in body mode which yields text and star, underscore,
-/// backtick tokens.
-#[derive(Debug, Copy, Clone, Eq, PartialEq, Hash)]
-pub enum TokenMode {
- Header,
- Body,
-}
-
-impl<'s> Tokens<'s> {
- /// Create a new token iterator with the given mode.
- pub fn new(src: &'s str, mode: TokenMode) -> Self {
- Self {
- src,
- iter: src.chars().peekable(),
- mode,
- stack: vec![],
- pos: Pos::ZERO,
- index: 0,
- }
- }
-
- /// Change the token mode and push the old one on a stack.
- pub fn push_mode(&mut self, mode: TokenMode) {
- self.stack.push(self.mode);
- self.mode = mode;
- }
-
- /// Pop the old token mode from the stack. This panics if there is no mode
- /// on the stack.
- pub fn pop_mode(&mut self) {
- self.mode = self.stack.pop().expect("no pushed mode");
- }
-
- /// The index in the string at which the last token ends and the next token
- /// will start.
- pub fn index(&self) -> usize {
- self.index
- }
-
- /// The line-column position in the source at which the last token ends and
- /// the next token will start.
- pub fn pos(&self) -> Pos {
- self.pos
- }
-}
-
-impl<'s> Iterator for Tokens<'s> {
- type Item = Spanned<Token<'s>>;
-
- /// Parse the next token in the source code.
- fn next(&mut self) -> Option<Self::Item> {
- let start = self.pos();
- let first = self.eat()?;
-
- let token = match first {
- // Comments.
- '/' if self.peek() == Some('/') => self.read_line_comment(),
- '/' if self.peek() == Some('*') => self.read_block_comment(),
- '*' if self.peek() == Some('/') => {
- self.eat();
- Invalid("*/")
- }
-
- // Whitespace.
- c if c.is_whitespace() => self.read_whitespace(start),
-
- // Functions and blocks.
- '[' => LeftBracket,
- ']' => RightBracket,
- '{' => LeftBrace,
- '}' => RightBrace,
-
- // Syntactic elements in function headers.
- '(' if self.mode == Header => LeftParen,
- ')' if self.mode == Header => RightParen,
- ':' if self.mode == Header => Colon,
- ',' if self.mode == Header => Comma,
- '=' if self.mode == Header => Equals,
- '>' if self.mode == Header && self.peek() == Some('>') => self.read_chain(),
-
- // Expression operators.
- '+' if self.mode == Header => Plus,
- '-' if self.mode == Header => Hyphen,
- '/' if self.mode == Header => Slash,
-
- // Star serves a double purpose as a style modifier
- // and an expression operator in the header.
- '*' => Star,
-
- // A hex expression.
- '#' if self.mode == Header => self.read_hex(),
-
- // String values.
- '"' if self.mode == Header => self.read_string(),
-
- // Style toggles.
- '_' if self.mode == Body => Underscore,
- '`' if self.mode == Body => self.read_raw_or_code(),
-
- // Sections.
- '#' if self.mode == Body => Hashtag,
-
- // Non-breaking spaces.
- '~' if self.mode == Body => Text("\u{00A0}"),
-
- // An escaped thing.
- '\\' if self.mode == Body => self.read_escaped(),
-
- // Expressions or just strings.
- c => {
- let body = self.mode == Body;
-
- let start_offset = -(c.len_utf8() as isize);
- let mut last_was_e = false;
-
- let (text, _) = self.read_string_until(false, start_offset, 0, |n| {
- let val = match n {
- c if c.is_whitespace() => true,
- '[' | ']' | '{' | '}' | '/' | '*' => true,
- '\\' | '_' | '`' | '#' | '~' if body => true,
- ':' | '=' | ',' | '"' | '(' | ')' if !body => true,
- '+' | '-' if !body && !last_was_e => true,
- _ => false,
- };
-
- last_was_e = n == 'e' || n == 'E';
- val
- });
-
- if self.mode == Header {
- self.read_expr(text)
- } else {
- Text(text)
- }
- }
- };
-
- let end = self.pos();
- let span = Span { start, end };
-
- Some(Spanned { v: token, span })
- }
-}
-
-impl<'s> Tokens<'s> {
- fn read_line_comment(&mut self) -> Token<'s> {
- self.eat();
- LineComment(self.read_string_until(false, 0, 0, is_newline_char).0)
- }
-
- fn read_block_comment(&mut self) -> Token<'s> {
- enum Last {
- Slash,
- Star,
- Other,
- }
-
- let mut depth = 0;
- let mut last = Last::Other;
-
- // Find the first `*/` that does not correspond to a nested `/*`.
- // Remove the last two bytes to obtain the raw inner text without `*/`.
- self.eat();
- let (content, _) = self.read_string_until(true, 0, -2, |c| {
- match c {
- '/' => match last {
- Last::Star if depth == 0 => return true,
- Last::Star => depth -= 1,
- _ => last = Last::Slash,
- },
- '*' => match last {
- Last::Slash => depth += 1,
- _ => last = Last::Star,
- },
- _ => last = Last::Other,
- }
-
- false
- });
-
- BlockComment(content)
- }
-
- fn read_chain(&mut self) -> Token<'s> {
- assert!(self.eat() == Some('>'));
- Chain
- }
-
- fn read_whitespace(&mut self, start: Pos) -> Token<'s> {
- self.read_string_until(false, 0, 0, |n| !n.is_whitespace());
- let end = self.pos();
-
- Space(end.line - start.line)
- }
-
- fn read_string(&mut self) -> Token<'s> {
- let (string, terminated) = self.read_until_unescaped('"');
- Str { string, terminated }
- }
-
- fn read_raw_or_code(&mut self) -> Token<'s> {
- let (raw, terminated) = self.read_until_unescaped('`');
- if raw.is_empty() && terminated && self.peek() == Some('`') {
- // Third tick found; this is a code block.
- self.eat();
-
- // Reads the lang tag (until a backtick, whitespace or newline).
- let start = self.pos();
- let (lang, _) = self.read_string_until(false, 0, 0, |c| {
- c == '`' || c.is_whitespace() || is_newline_char(c)
- });
- let end = self.pos();
-
- let lang = if !lang.is_empty() {
- Some(Spanned::new(lang, Span::new(start, end)))
- } else {
- None
- };
-
- // Skip to start of raw contents.
- while let Some(c) = self.peek() {
- if is_newline_char(c) {
- self.eat();
- if c == '\r' && self.peek() == Some('\n') {
- self.eat();
- }
-
- break;
- } else if c.is_whitespace() {
- self.eat();
- } else {
- break;
- }
- }
-
- let start = self.index();
- let mut backticks = 0u32;
-
- while backticks < 3 {
- match self.eat() {
- Some('`') => backticks += 1,
- // Escaping of triple backticks.
- Some('\\') if backticks == 1 && self.peek() == Some('`') => {
- backticks = 0;
- }
- Some(_) => {}
- None => break,
- }
- }
-
- let terminated = backticks == 3;
- let end = self.index() - if terminated { 3 } else { 0 };
-
- Code {
- lang,
- raw: &self.src[start .. end],
- terminated,
- }
- } else {
- Raw { raw, terminated }
- }
- }
-
- fn read_until_unescaped(&mut self, end: char) -> (&'s str, bool) {
- let mut escaped = false;
- self.read_string_until(true, 0, -1, |c| {
- match c {
- c if c == end && !escaped => return true,
- '\\' => escaped = !escaped,
- _ => escaped = false,
- }
-
- false
- })
- }
-
- fn read_escaped(&mut self) -> Token<'s> {
- fn is_escapable(c: char) -> bool {
- match c {
- '[' | ']' | '\\' | '/' | '*' | '_' | '`' | '"' | '#' | '~' => true,
- _ => false,
- }
- }
-
- match self.peek() {
- Some('u') => {
- self.eat();
- if self.peek() == Some('{') {
- self.eat();
- let (sequence, _) =
- self.read_string_until(false, 0, 0, |c| !c.is_ascii_hexdigit());
-
- let terminated = self.peek() == Some('}');
- if terminated {
- self.eat();
- }
-
- UnicodeEscape { sequence, terminated }
- } else {
- Text("\\u")
- }
- }
- Some(c) if is_escapable(c) => {
- let index = self.index();
- self.eat();
- Text(&self.src[index .. index + c.len_utf8()])
- }
- Some(c) if c.is_whitespace() => Backslash,
- Some(_) => Text("\\"),
- None => Backslash,
- }
- }
-
- fn read_hex(&mut self) -> Token<'s> {
- // This will parse more than the permissible 0-9, a-f, A-F character
- // ranges to provide nicer error messages later.
- Hex(self.read_string_until(false, 0, 0, |n| !n.is_ascii_alphanumeric()).0)
- }
-
- fn read_expr(&mut self, text: &'s str) -> Token<'s> {
- if let Ok(b) = text.parse::<bool>() {
- Bool(b)
- } else if let Ok(num) = text.parse::<f64>() {
- Number(num)
- } else if let Some(num) = parse_percentage(text) {
- Number(num / 100.0)
- } else if let Ok(length) = text.parse::<Length>() {
- Length(length)
- } else if is_identifier(text) {
- Ident(text)
- } else {
- Invalid(text)
- }
- }
-
- /// Will read the input stream until `f` evaluates to `true`. When
- /// `eat_match` is true, the character for which `f` was true is consumed.
- /// Returns the string from the index where this was called offset by
- /// `offset_start` to the end offset by `offset_end`. The end is before or
- /// after the match depending on `eat_match`.
- fn read_string_until(
- &mut self,
- eat_match: bool,
- offset_start: isize,
- offset_end: isize,
- mut f: impl FnMut(char) -> bool,
- ) -> (&'s str, bool) {
- let start = ((self.index() as isize) + offset_start) as usize;
- let mut matched = false;
-
- while let Some(c) = self.peek() {
- if f(c) {
- matched = true;
- if eat_match {
- self.eat();
- }
- break;
- }
-
- self.eat();
- }
-
- let mut end = self.index();
- if matched {
- end = ((end as isize) + offset_end) as usize;
- }
-
- (&self.src[start .. end], matched)
- }
-
- fn eat(&mut self) -> Option<char> {
- let c = self.iter.next()?;
- self.index += c.len_utf8();
-
- if is_newline_char(c) && !(c == '\r' && self.peek() == Some('\n')) {
- self.pos.line += 1;
- self.pos.column = 0;
- } else {
- self.pos.column += 1;
- }
-
- Some(c)
- }
-
- fn peek(&mut self) -> Option<char> {
- self.iter.peek().copied()
- }
-}
-
-fn parse_percentage(text: &str) -> Option<f64> {
- if text.ends_with('%') {
- text[.. text.len() - 1].parse::<f64>().ok()
- } else {
- None
- }
-}
-
-/// Whether this character denotes a newline.
-pub fn is_newline_char(character: char) -> bool {
- match character {
- // Line Feed, Vertical Tab, Form Feed, Carriage Return.
- '\x0A' ..= '\x0D' => true,
- // Next Line, Line Separator, Paragraph Separator.
- '\u{0085}' | '\u{2028}' | '\u{2029}' => true,
- _ => false,
- }
-}
-
-/// Whether this word is a valid identifier.
-pub fn is_identifier(string: &str) -> bool {
- fn is_extra_allowed(c: char) -> bool {
- c == '.' || c == '-' || c == '_'
- }
-
- let mut chars = string.chars();
- match chars.next() {
- Some(c) if UnicodeXID::is_xid_start(c) || is_extra_allowed(c) => {}
- _ => return false,
- }
-
- for c in chars {
- match c {
- c if UnicodeXID::is_xid_continue(c) || is_extra_allowed(c) => {}
- _ => return false,
- }
- }
-
- true
-}
-
-#[cfg(test)]
-#[allow(non_snake_case)]
-mod tests {
- use super::super::span::Spanned;
- use super::*;
- use crate::length::Length;
- use crate::syntax::tests::*;
- use Token::{
- BlockComment as BC, Bool, Chain, Hex, Hyphen as Min, Ident as Id,
- LeftBrace as LB, LeftBracket as L, LeftParen as LP, Length as Len,
- LineComment as LC, Number as Num, Plus, RightBrace as RB, RightBracket as R,
- RightParen as RP, Slash, Space as S, Star, Text as T,
- };
-
- fn Str(string: &str, terminated: bool) -> Token {
- Token::Str { string, terminated }
- }
- fn Raw(raw: &str, terminated: bool) -> Token {
- Token::Raw { raw, terminated }
- }
- fn Code<'a>(
- lang: Option<Spanned<&'a str>>,
- raw: &'a str,
- terminated: bool,
- ) -> Token<'a> {
- Token::Code { lang, raw, terminated }
- }
- fn Lang<'a, T: Into<Spanned<&'a str>>>(lang: T) -> Option<Spanned<&'a str>> {
- Some(Into::<Spanned<&str>>::into(lang))
- }
- fn UE(sequence: &str, terminated: bool) -> Token {
- Token::UnicodeEscape { sequence, terminated }
- }
-
- macro_rules! t { ($($tts:tt)*) => {test!(@spans=false, $($tts)*)} }
- macro_rules! ts { ($($tts:tt)*) => {test!(@spans=true, $($tts)*)} }
- macro_rules! test {
- (@spans=$spans:expr, $mode:expr, $src:expr => $($token:expr),*) => {
- let exp = vec![$(Into::<Spanned<Token>>::into($token)),*];
- let found = Tokens::new($src, $mode).collect::<Vec<_>>();
- check($src, exp, found, $spans);
- }
- }
-
- #[test]
- fn tokenize_whitespace() {
- t!(Body, "" => );
- t!(Body, " " => S(0));
- t!(Body, " " => S(0));
- t!(Body, "\t" => S(0));
- t!(Body, " \t" => S(0));
- t!(Body, "\n" => S(1));
- t!(Body, "\n " => S(1));
- t!(Body, " \n" => S(1));
- t!(Body, " \n " => S(1));
- t!(Body, "\r\n" => S(1));
- t!(Body, " \n\t \n " => S(2));
- t!(Body, "\n\r" => S(2));
- t!(Body, " \r\r\n \x0D" => S(3));
- t!(Body, "a~b" => T("a"), T("\u{00A0}"), T("b"));
- }
-
- #[test]
- fn tokenize_comments() {
- t!(Body, "a // bc\n " => T("a"), S(0), LC(" bc"), S(1));
- t!(Body, "a //a//b\n " => T("a"), S(0), LC("a//b"), S(1));
- t!(Body, "a //a//b\r\n" => T("a"), S(0), LC("a//b"), S(1));
- t!(Body, "a //a//b\n\nhello" => T("a"), S(0), LC("a//b"), S(2), T("hello"));
- t!(Body, "/**/" => BC(""));
- t!(Body, "_/*_/*a*/*/" => Underscore, BC("_/*a*/"));
- t!(Body, "/*/*/" => BC("/*/"));
- t!(Body, "abc*/" => T("abc"), Invalid("*/"));
- t!(Body, "/***/" => BC("*"));
- t!(Body, "/**\\****/*/*/" => BC("*\\***"), Invalid("*/"), Invalid("*/"));
- t!(Body, "/*abc" => BC("abc"));
- }
-
- #[test]
- fn tokenize_body_only_tokens() {
- t!(Body, "_*" => Underscore, Star);
- t!(Body, "***" => Star, Star, Star);
- t!(Body, "[func]*bold*" => L, T("func"), R, Star, T("bold"), Star);
- t!(Body, "hi_you_ there" => T("hi"), Underscore, T("you"), Underscore, S(0), T("there"));
- t!(Body, "`raw`" => Raw("raw", true));
- t!(Body, "# hi" => Hashtag, S(0), T("hi"));
- t!(Body, "#()" => Hashtag, T("()"));
- t!(Body, "`[func]`" => Raw("[func]", true));
- t!(Body, "`]" => Raw("]", false));
- t!(Body, "\\ " => Backslash, S(0));
- t!(Body, "`\\``" => Raw("\\`", true));
- t!(Body, "``not code`" => Raw("", true), T("not"), S(0), T("code"), Raw("", false));
- t!(Body, "```rust hi```" => Code(Lang("rust"), "hi", true));
- t!(Body, "``` hi`\\``" => Code(None, "hi`\\``", false));
- t!(Body, "```js \r\n document.write(\"go\")" => Code(Lang("js"), " document.write(\"go\")", false));
- t!(Header, "_`" => Invalid("_`"));
- }
-
- #[test]
- fn tokenize_header_only_tokens() {
- t!(Body, "a: b" => T("a:"), S(0), T("b"));
- t!(Body, "c=d, " => T("c=d,"), S(0));
- t!(Header, "(){}:=," => LP, RP, LB, RB, Colon, Equals, Comma);
- t!(Header, "a:b" => Id("a"), Colon, Id("b"));
- t!(Header, "#6ae6dd" => Hex("6ae6dd"));
- t!(Header, "#8A083c" => Hex("8A083c"));
- t!(Header, "a: true, x=1" => Id("a"), Colon, S(0), Bool(true), Comma, S(0),
- Id("x"), Equals, Num(1.0));
- t!(Header, "=3.14" => Equals, Num(3.14));
- t!(Header, "12.3e5" => Num(12.3e5));
- t!(Header, "120%" => Num(1.2));
- t!(Header, "12e4%" => Num(1200.0));
- t!(Header, "__main__" => Id("__main__"));
- t!(Header, ">main" => Invalid(">main"));
- t!(Header, ".func.box" => Id(".func.box"));
- t!(Header, "arg, _b, _1" => Id("arg"), Comma, S(0), Id("_b"), Comma, S(0), Id("_1"));
- t!(Header, "f: arg >> g" => Id("f"), Colon, S(0), Id("arg"), S(0), Chain, S(0), Id("g"));
- t!(Header, "12_pt, 12pt" => Invalid("12_pt"), Comma, S(0), Len(Length::pt(12.0)));
- t!(Header, "1e5in" => Len(Length::inches(100000.0)));
- t!(Header, "2.3cm" => Len(Length::cm(2.3)));
- t!(Header, "12e-3in" => Len(Length::inches(12e-3)));
- t!(Header, "6.1cm + 4pt,a=1*2" => Len(Length::cm(6.1)), S(0), Plus, S(0), Len(Length::pt(4.0)),
- Comma, Id("a"), Equals, Num(1.0), Star, Num(2.0));
- t!(Header, "(5 - 1) / 2.1" => LP, Num(5.0), S(0), Min, S(0), Num(1.0), RP,
- S(0), Slash, S(0), Num(2.1));
- t!(Header, "-1" => Min, Num(1.0));
- t!(Header, "--1" => Min, Min, Num(1.0));
- t!(Header, "- 1" => Min, S(0), Num(1.0));
- t!(Header, "02.4mm" => Len(Length::mm(2.4)));
- t!(Header, "2.4.cm" => Invalid("2.4.cm"));
- t!(Header, "(1,2)" => LP, Num(1.0), Comma, Num(2.0), RP);
- t!(Header, "{abc}" => LB, Id("abc"), RB);
- t!(Header, "🌓, 🌍," => Invalid("🌓"), Comma, S(0), Invalid("🌍"), Comma);
- }
-
- #[test]
- fn tokenize_strings() {
- t!(Body, "a \"hi\" string" => T("a"), S(0), T("\"hi\""), S(0), T("string"));
- t!(Header, "\"hello" => Str("hello", false));
- t!(Header, "\"hello world\"" => Str("hello world", true));
- t!(Header, "\"hello\nworld\"" => Str("hello\nworld", true));
- t!(Header, r#"1"hello\nworld"false"# => Num(1.0), Str("hello\\nworld", true), Bool(false));
- t!(Header, r#""a\"bc""# => Str(r#"a\"bc"#, true));
- t!(Header, r#""a\\"bc""# => Str(r#"a\\"#, true), Id("bc"), Str("", false));
- t!(Header, r#""a\tbc"# => Str("a\\tbc", false));
- t!(Header, "\"🌎\"" => Str("🌎", true));
- }
-
- #[test]
- fn tokenize_escaped_symbols() {
- t!(Body, r"\\" => T(r"\"));
- t!(Body, r"\[" => T("["));
- t!(Body, r"\]" => T("]"));
- t!(Body, r"\*" => T("*"));
- t!(Body, r"\_" => T("_"));
- t!(Body, r"\`" => T("`"));
- t!(Body, r"\/" => T("/"));
- t!(Body, r"\u{2603}" => UE("2603", true));
- t!(Body, r"\u{26A4" => UE("26A4", false));
- t!(Body, r#"\""# => T("\""));
- }
-
- #[test]
- fn tokenize_unescapable_symbols() {
- t!(Body, r"\a" => T("\\"), T("a"));
- t!(Body, r"\:" => T(r"\"), T(":"));
- t!(Body, r"\=" => T(r"\"), T("="));
- t!(Body, r"\u{2GA4"=> UE("2", false), T("GA4"));
- t!(Body, r"\u{ " => UE("", false), Space(0));
- t!(Body, r"\u" => T(r"\u"));
- t!(Header, r"\\\\" => Invalid(r"\\\\"));
- t!(Header, r"\a" => Invalid(r"\a"));
- t!(Header, r"\:" => Invalid(r"\"), Colon);
- t!(Header, r"\=" => Invalid(r"\"), Equals);
- t!(Header, r"\," => Invalid(r"\"), Comma);
- }
-
- #[test]
- fn tokenize_with_spans() {
- ts!(Body, "hello" => s(0,0, 0,5, T("hello")));
- ts!(Body, "ab\r\nc" => s(0,0, 0,2, T("ab")), s(0,2, 1,0, S(1)), s(1,0, 1,1, T("c")));
- ts!(Body, "// ab\r\n\nf" => s(0,0, 0,5, LC(" ab")), s(0,5, 2,0, S(2)), s(2,0, 2,1, T("f")));
- ts!(Body, "/*b*/_" => s(0,0, 0,5, BC("b")), s(0,5, 0,6, Underscore));
- ts!(Header, "a=10" => s(0,0, 0,1, Id("a")), s(0,1, 0,2, Equals), s(0,2, 0,4, Num(10.0)));
- }
-}