From 01405902ba18726ccae2f71da9dfef26fac9c357 Mon Sep 17 00:00:00 2001 From: Laurenz Date: Wed, 30 Sep 2020 19:13:55 +0200 Subject: =?UTF-8?q?Restructure=20parser=20files=20=F0=9F=8D=94?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/parse/escaping.rs | 215 ---------------- src/parse/mod.rs | 653 ++++++++++++++++++++++++++++++++++++++++++++++- src/parse/parser.rs | 645 ---------------------------------------------- src/parse/postprocess.rs | 217 ++++++++++++++++ src/parse/tests.rs | 2 + src/parse/tokenizer.rs | 606 ------------------------------------------- src/parse/tokens.rs | 606 +++++++++++++++++++++++++++++++++++++++++++ 7 files changed, 1473 insertions(+), 1471 deletions(-) delete mode 100644 src/parse/escaping.rs delete mode 100644 src/parse/parser.rs create mode 100644 src/parse/postprocess.rs delete mode 100644 src/parse/tokenizer.rs create mode 100644 src/parse/tokens.rs (limited to 'src/parse') diff --git a/src/parse/escaping.rs b/src/parse/escaping.rs deleted file mode 100644 index 2e556d0c..00000000 --- a/src/parse/escaping.rs +++ /dev/null @@ -1,215 +0,0 @@ -use super::is_newline_char; -use crate::syntax::{Ident, Raw}; - -/// Resolves all escape sequences in a string. -pub fn unescape_string(string: &str) -> String { - let mut iter = string.chars().peekable(); - let mut out = String::with_capacity(string.len()); - - while let Some(c) = iter.next() { - if c != '\\' { - out.push(c); - continue; - } - - match iter.next() { - Some('\\') => out.push('\\'), - Some('"') => out.push('"'), - - Some('n') => out.push('\n'), - Some('t') => out.push('\t'), - Some('u') if iter.peek() == Some(&'{') => { - iter.next(); - - // TODO: Feedback if closing brace is missing. - let mut sequence = String::new(); - let terminated = loop { - match iter.peek() { - Some('}') => { - iter.next(); - break true; - } - Some(&c) if c.is_ascii_hexdigit() => { - iter.next(); - sequence.push(c); - } - _ => break false, - } - }; - - if let Some(c) = hex_to_char(&sequence) { - out.push(c); - } else { - // TODO: Feedback that escape sequence is wrong. - out.push_str("\\u{"); - out.push_str(&sequence); - if terminated { - out.push('}'); - } - } - } - - other => { - out.push('\\'); - out.extend(other); - } - } - } - - out -} - -/// Resolves the language tag and trims the raw text. -/// -/// Returns: -/// - The language tag -/// - The raw lines -/// - Whether at least one newline was present in the untrimmed text. -pub fn process_raw(raw: &str) -> Raw { - let (lang, inner) = split_after_lang_tag(raw); - let (lines, had_newline) = trim_and_split_raw(inner); - Raw { lang, lines, inline: !had_newline } -} - -/// Parse the lang tag and return it alongside the remaining inner raw text. -fn split_after_lang_tag(raw: &str) -> (Option, &str) { - let mut lang = String::new(); - - let mut inner = raw; - let mut iter = raw.chars(); - - while let Some(c) = iter.next() { - if c == '`' || c.is_whitespace() || is_newline_char(c) { - break; - } - - inner = iter.as_str(); - lang.push(c); - } - - (Ident::new(lang), inner) -} - -/// Trims raw text and splits it into lines. -/// -/// Returns whether at least one newline was contained in `raw`. -fn trim_and_split_raw(raw: &str) -> (Vec, bool) { - // Trims one whitespace at end and start. - let raw = raw.strip_prefix(' ').unwrap_or(raw); - let raw = raw.strip_suffix(' ').unwrap_or(raw); - - let mut lines = split_lines(raw); - let had_newline = lines.len() > 1; - let is_whitespace = |line: &String| line.chars().all(char::is_whitespace); - - // Trims a sequence of whitespace followed by a newline at the start. - if lines.first().map(is_whitespace).unwrap_or(false) { - lines.remove(0); - } - - // Trims a newline followed by a sequence of whitespace at the end. - if lines.last().map(is_whitespace).unwrap_or(false) { - lines.pop(); - } - - (lines, had_newline) -} - -/// Splits a string into a vector of lines (respecting Unicode & Windows line breaks). -pub fn split_lines(text: &str) -> Vec { - let mut iter = text.chars().peekable(); - let mut line = String::new(); - let mut lines = Vec::new(); - - while let Some(c) = iter.next() { - if is_newline_char(c) { - if c == '\r' && iter.peek() == Some(&'\n') { - iter.next(); - } - - lines.push(std::mem::take(&mut line)); - } else { - line.push(c); - } - } - - lines.push(line); - lines -} - -/// Converts a hexademical sequence (without braces or "\u") into a character. -pub fn hex_to_char(sequence: &str) -> Option { - u32::from_str_radix(sequence, 16).ok().and_then(std::char::from_u32) -} - -#[cfg(test)] -#[rustfmt::skip] -mod tests { - use super::*; - - #[test] - fn test_unescape_strings() { - fn test(string: &str, expected: &str) { - assert_eq!(unescape_string(string), expected.to_string()); - } - - test(r#"hello world"#, "hello world"); - test(r#"hello\nworld"#, "hello\nworld"); - test(r#"a\"bc"#, "a\"bc"); - test(r#"a\u{2603}bc"#, "a☃bc"); - test(r#"a\u{26c3bg"#, "a𦰻g"); - test(r#"av\u{6797"#, "av林"); - test(r#"a\\"#, "a\\"); - test(r#"a\\\nbc"#, "a\\\nbc"); - test(r#"a\tbc"#, "a\tbc"); - test(r"🌎", "🌎"); - test(r"🌎\", r"🌎\"); - test(r"\🌎", r"\🌎"); - } - - #[test] - fn test_split_after_lang_tag() { - fn test(raw: &str, lang: Option<&str>, inner: &str) { - let (found_lang, found_inner) = split_after_lang_tag(raw); - assert_eq!(found_lang.as_ref().map(|id| id.as_str()), lang); - assert_eq!(found_inner, inner); - } - - test("typst it!", Some("typst"), " it!"); - test("typst\n it!", Some("typst"), "\n it!"); - test("typst\n it!", Some("typst"), "\n it!"); - test("abc`", Some("abc"), "`"); - test(" hi", None, " hi"); - test("`", None, "`"); - } - - #[test] - fn test_trim_raw() { - fn test(raw: &str, expected: Vec<&str>) { - assert_eq!(trim_and_split_raw(raw).0, expected); - } - - test(" hi", vec!["hi"]); - test(" hi", vec![" hi"]); - test("\nhi", vec!["hi"]); - test(" \n hi", vec![" hi"]); - test("hi ", vec!["hi"]); - test("hi ", vec!["hi "]); - test("hi\n", vec!["hi"]); - test("hi \n ", vec!["hi "]); - test(" \n hi \n ", vec![" hi "]); - } - - #[test] - fn test_split_lines() { - fn test(raw: &str, expected: Vec<&str>) { - assert_eq!(split_lines(raw), expected); - } - - test("raw\ntext", vec!["raw", "text"]); - test("a\r\nb", vec!["a", "b"]); - test("a\n\nb", vec!["a", "", "b"]); - test("a\r\x0Bb", vec!["a", "", "b"]); - test("a\r\n\r\nb", vec!["a", "", "b"]); - } -} diff --git a/src/parse/mod.rs b/src/parse/mod.rs index 340e89ea..e7ab89f1 100644 --- a/src/parse/mod.rs +++ b/src/parse/mod.rs @@ -1,11 +1,654 @@ //! Parsing and tokenization. -mod escaping; -mod parser; -mod tokenizer; +mod postprocess; +mod tokens; -pub use parser::*; -pub use tokenizer::*; +pub use tokens::*; + +use std::str::FromStr; + +use super::*; +use crate::color::RgbaColor; +use crate::compute::table::SpannedEntry; +use crate::syntax::*; +use crate::{Feedback, Pass}; + +/// Parse a string of source code. +pub fn parse(src: &str) -> Pass { + Parser::new(src).parse() +} + +struct Parser<'s> { + tokens: Tokens<'s>, + peeked: Option>>>, + delimiters: Vec<(Pos, Token<'static>)>, + at_block_or_line_start: bool, + feedback: Feedback, +} + +impl<'s> Parser<'s> { + fn new(src: &'s str) -> Self { + Self { + tokens: Tokens::new(src, TokenMode::Body), + peeked: None, + delimiters: vec![], + at_block_or_line_start: true, + feedback: Feedback::new(), + } + } + + fn parse(mut self) -> Pass { + let tree = self.parse_body_contents(); + Pass::new(tree, self.feedback) + } +} + +// Typesetting content. +impl Parser<'_> { + fn parse_body_contents(&mut self) -> SyntaxTree { + let mut tree = SyntaxTree::new(); + + self.at_block_or_line_start = true; + while !self.eof() { + if let Some(node) = self.parse_node() { + tree.push(node); + } + } + + tree + } + + fn parse_node(&mut self) -> Option> { + let token = self.peek()?; + let end = Span::at(token.span.end); + + // Set block or line start to false because most nodes have that effect, but + // remember the old value to actually check it for hashtags and because comments + // and spaces want to retain it. + let was_at_block_or_line_start = self.at_block_or_line_start; + self.at_block_or_line_start = false; + + Some(match token.v { + // Starting from two newlines counts as a paragraph break, a single + // newline does not. + Token::Space(n) => { + if n == 0 { + self.at_block_or_line_start = was_at_block_or_line_start; + } else if n >= 1 { + self.at_block_or_line_start = true; + } + + self.with_span(if n >= 2 { + SyntaxNode::Parbreak + } else { + SyntaxNode::Spacing + }) + } + + Token::LineComment(_) | Token::BlockComment(_) => { + self.at_block_or_line_start = was_at_block_or_line_start; + self.eat(); + return None; + } + + Token::LeftBracket => { + let call = self.parse_bracket_call(false); + self.at_block_or_line_start = false; + call.map(SyntaxNode::Call) + } + + Token::Star => self.with_span(SyntaxNode::ToggleBolder), + Token::Underscore => self.with_span(SyntaxNode::ToggleItalic), + Token::Backslash => self.with_span(SyntaxNode::Linebreak), + + Token::Hashtag if was_at_block_or_line_start => { + self.parse_heading().map(SyntaxNode::Heading) + } + + Token::Raw { raw, backticks, terminated } => { + if !terminated { + error!(@self.feedback, end, "expected backtick(s)"); + } + + let raw = if backticks > 1 { + postprocess::process_raw(raw) + } else { + Raw { + lang: None, + lines: postprocess::split_lines(raw), + inline: true, + } + }; + + self.with_span(SyntaxNode::Raw(raw)) + } + + Token::Text(text) => self.with_span(SyntaxNode::Text(text.to_string())), + Token::Hashtag => self.with_span(SyntaxNode::Text("#".to_string())), + + Token::UnicodeEscape { sequence, terminated } => { + if !terminated { + error!(@self.feedback, end, "expected closing brace"); + } + + if let Some(c) = postprocess::hex_to_char(sequence) { + self.with_span(SyntaxNode::Text(c.to_string())) + } else { + error!(@self.feedback, token.span, "invalid unicode escape sequence"); + self.eat(); + return None; + } + } + + unexpected => { + error!(@self.feedback, token.span, "unexpected {}", unexpected.name()); + self.eat(); + return None; + } + }) + } + + fn parse_heading(&mut self) -> Spanned { + let start = self.pos(); + self.assert(Token::Hashtag); + + let mut level = 0; + while self.peekv() == Some(Token::Hashtag) { + level += 1; + self.eat(); + } + + let span = Span::new(start, self.pos()); + let level = level.span_with(span); + + if level.v > 5 { + warning!( + @self.feedback, level.span, + "section depth larger than 6 has no effect", + ); + } + + self.skip_ws(); + + let mut tree = SyntaxTree::new(); + while !self.eof() && !matches!(self.peekv(), Some(Token::Space(n)) if n >= 1) { + if let Some(node) = self.parse_node() { + tree.push(node); + } + } + + let span = Span::new(start, self.pos()); + Heading { level, tree }.span_with(span) + } +} + +// Function calls. +impl Parser<'_> { + fn parse_bracket_call(&mut self, chained: bool) -> Spanned { + let before_bracket = self.pos(); + if !chained { + self.start_group(Group::Bracket); + self.tokens.push_mode(TokenMode::Header); + } + + let before_name = self.pos(); + self.start_group(Group::Subheader); + self.skip_ws(); + let name = self.parse_ident().unwrap_or_else(|| { + self.expected_found_or_at("function name", before_name); + Ident(String::new()).span_with(Span::at(before_name)) + }); + + self.skip_ws(); + + let mut args = match self.eatv() { + Some(Token::Colon) => self.parse_table_contents().0, + Some(_) => { + self.expected_at("colon", name.span.end); + while self.eat().is_some() {} + TableExpr::new() + } + None => TableExpr::new(), + }; + + self.end_group(); + self.skip_ws(); + let (has_chained_child, end) = if self.peek().is_some() { + let item = self.parse_bracket_call(true); + let span = item.span; + let t = vec![item.map(SyntaxNode::Call)]; + args.push(SpannedEntry::val(Expr::Tree(t).span_with(span))); + (true, span.end) + } else { + self.tokens.pop_mode(); + (false, self.end_group().end) + }; + + let start = if chained { before_name } else { before_bracket }; + let mut span = Span::new(start, end); + + if self.check(Token::LeftBracket) && !has_chained_child { + self.start_group(Group::Bracket); + self.tokens.push_mode(TokenMode::Body); + + let body = self.parse_body_contents(); + + self.tokens.pop_mode(); + let body_span = self.end_group(); + + let expr = Expr::Tree(body); + args.push(SpannedEntry::val(expr.span_with(body_span))); + span.expand(body_span); + } + + CallExpr { name, args }.span_with(span) + } + + fn parse_paren_call(&mut self, name: Spanned) -> Spanned { + self.start_group(Group::Paren); + let args = self.parse_table_contents().0; + let args_span = self.end_group(); + let span = Span::merge(name.span, args_span); + CallExpr { name, args }.span_with(span) + } +} + +// Tables. +impl Parser<'_> { + fn parse_table_contents(&mut self) -> (TableExpr, bool) { + let mut table = TableExpr::new(); + let mut comma_and_keyless = true; + + while { + self.skip_ws(); + !self.eof() + } { + let (key, val) = if let Some(ident) = self.parse_ident() { + self.skip_ws(); + + match self.peekv() { + Some(Token::Equals) => { + self.eat(); + self.skip_ws(); + if let Some(value) = self.parse_expr() { + (Some(ident), value) + } else { + self.expected("value"); + continue; + } + } + + Some(Token::LeftParen) => { + let call = self.parse_paren_call(ident); + (None, call.map(Expr::Call)) + } + + _ => (None, ident.map(Expr::Ident)), + } + } else if let Some(value) = self.parse_expr() { + (None, value) + } else { + self.expected("value"); + continue; + }; + + let behind = val.span.end; + if let Some(key) = key { + comma_and_keyless = false; + table.insert(key.v.0, SpannedEntry::new(key.span, val)); + self.feedback + .decorations + .push(Decoration::TableKey.span_with(key.span)); + } else { + table.push(SpannedEntry::val(val)); + } + + if { + self.skip_ws(); + self.eof() + } { + break; + } + + self.expect_at(Token::Comma, behind); + comma_and_keyless = false; + } + + let coercable = comma_and_keyless && !table.is_empty(); + (table, coercable) + } +} + +type Binop = fn(Box>, Box>) -> Expr; + +// Expressions and values. +impl Parser<'_> { + fn parse_expr(&mut self) -> Option> { + self.parse_binops("summand", Self::parse_term, |token| match token { + Token::Plus => Some(Expr::Add), + Token::Hyphen => Some(Expr::Sub), + _ => None, + }) + } + + fn parse_term(&mut self) -> Option> { + self.parse_binops("factor", Self::parse_factor, |token| match token { + Token::Star => Some(Expr::Mul), + Token::Slash => Some(Expr::Div), + _ => None, + }) + } + + /// Parse expression of the form ` ( )*`. + fn parse_binops( + &mut self, + operand_name: &str, + mut parse_operand: impl FnMut(&mut Self) -> Option>, + mut parse_op: impl FnMut(Token) -> Option, + ) -> Option> { + let mut left = parse_operand(self)?; + + self.skip_ws(); + while let Some(token) = self.peek() { + if let Some(op) = parse_op(token.v) { + self.eat(); + self.skip_ws(); + + if let Some(right) = parse_operand(self) { + let span = Span::merge(left.span, right.span); + let v = op(Box::new(left), Box::new(right)); + left = v.span_with(span); + self.skip_ws(); + continue; + } + + error!( + @self.feedback, Span::merge(left.span, token.span), + "missing right {}", operand_name, + ); + } + break; + } + + Some(left) + } + + fn parse_factor(&mut self) -> Option> { + if let Some(hyph) = self.check_eat(Token::Hyphen) { + self.skip_ws(); + if let Some(factor) = self.parse_factor() { + let span = Span::merge(hyph.span, factor.span); + Some(Expr::Neg(Box::new(factor)).span_with(span)) + } else { + error!(@self.feedback, hyph.span, "dangling minus"); + None + } + } else { + self.parse_value() + } + } + + fn parse_value(&mut self) -> Option> { + let Spanned { v: token, span } = self.peek()?; + Some(match token { + // This could be a function call or an identifier. + Token::Ident(id) => { + let name = Ident(id.to_string()).span_with(span); + self.eat(); + self.skip_ws(); + if self.check(Token::LeftParen) { + self.parse_paren_call(name).map(Expr::Call) + } else { + name.map(Expr::Ident) + } + } + + Token::Str { string, terminated } => { + if !terminated { + self.expected_at("quote", span.end); + } + self.with_span(Expr::Str(postprocess::unescape_string(string))) + } + + Token::Bool(b) => self.with_span(Expr::Bool(b)), + Token::Number(n) => self.with_span(Expr::Number(n)), + Token::Length(s) => self.with_span(Expr::Length(s)), + Token::Hex(s) => { + if let Ok(color) = RgbaColor::from_str(s) { + self.with_span(Expr::Color(color)) + } else { + // Heal color by assuming black. + error!(@self.feedback, span, "invalid color"); + let healed = RgbaColor::new_healed(0, 0, 0, 255); + self.with_span(Expr::Color(healed)) + } + } + + // This could be a table or a parenthesized expression. We parse as + // a table in any case and coerce the table into a value if it is + // coercable (length 1 and no trailing comma). + Token::LeftParen => { + self.start_group(Group::Paren); + let (table, coercable) = self.parse_table_contents(); + let span = self.end_group(); + + let expr = if coercable { + table.into_values().next().expect("table is coercable").val.v + } else { + Expr::Table(table) + }; + + expr.span_with(span) + } + + // This is a content expression. + Token::LeftBrace => { + self.start_group(Group::Brace); + self.tokens.push_mode(TokenMode::Body); + + let tree = self.parse_body_contents(); + + self.tokens.pop_mode(); + let span = self.end_group(); + Expr::Tree(tree).span_with(span) + } + + // This is a bracketed function call. + Token::LeftBracket => { + let call = self.parse_bracket_call(false); + let tree = vec![call.map(SyntaxNode::Call)]; + Expr::Tree(tree).span_with(span) + } + + _ => return None, + }) + } + + fn parse_ident(&mut self) -> Option> { + self.peek().and_then(|token| match token.v { + Token::Ident(id) => Some(self.with_span(Ident(id.to_string()))), + _ => None, + }) + } +} + +// Error handling. +impl Parser<'_> { + fn expect_at(&mut self, token: Token<'_>, pos: Pos) -> bool { + if self.check(token) { + self.eat(); + true + } else { + self.expected_at(token.name(), pos); + false + } + } + + fn expected(&mut self, thing: &str) { + if let Some(found) = self.eat() { + error!( + @self.feedback, found.span, + "expected {}, found {}", thing, found.v.name(), + ); + } else { + error!(@self.feedback, Span::at(self.pos()), "expected {}", thing); + } + } + + fn expected_at(&mut self, thing: &str, pos: Pos) { + error!(@self.feedback, Span::at(pos), "expected {}", thing); + } + + fn expected_found_or_at(&mut self, thing: &str, pos: Pos) { + if self.eof() { + self.expected_at(thing, pos) + } else { + self.expected(thing); + } + } +} + +// Parsing primitives. +impl<'s> Parser<'s> { + fn start_group(&mut self, group: Group) { + let start = self.pos(); + if let Some(start_token) = group.start() { + self.assert(start_token); + } + self.delimiters.push((start, group.end())); + } + + fn end_group(&mut self) -> Span { + let peeked = self.peek(); + + let (start, end_token) = self.delimiters.pop().expect("group was not started"); + + if end_token != Token::Chain && peeked != None { + self.delimiters.push((start, end_token)); + assert_eq!(peeked, None, "unfinished group"); + } + + match self.peeked.unwrap() { + Some(token) if token.v == end_token => { + self.peeked = None; + Span::new(start, token.span.end) + } + _ => { + let end = self.pos(); + if end_token != Token::Chain { + error!( + @self.feedback, Span::at(end), + "expected {}", end_token.name(), + ); + } + Span::new(start, end) + } + } + } + + fn skip_ws(&mut self) { + while matches!( + self.peekv(), + Some(Token::Space(_)) | + Some(Token::LineComment(_)) | + Some(Token::BlockComment(_)) + ) { + self.eat(); + } + } + + fn eatv(&mut self) -> Option> { + self.eat().map(Spanned::value) + } + + fn peekv(&mut self) -> Option> { + self.peek().map(Spanned::value) + } + + fn assert(&mut self, token: Token<'_>) { + assert!(self.check_eat(token).is_some()); + } + + fn check_eat(&mut self, token: Token<'_>) -> Option>> { + if self.check(token) { self.eat() } else { None } + } + + /// Checks if the next token is of some kind + fn check(&mut self, token: Token<'_>) -> bool { + self.peekv() == Some(token) + } + + fn with_span(&mut self, v: T) -> Spanned { + let span = self.eat().expect("expected token").span; + v.span_with(span) + } + + fn eof(&mut self) -> bool { + self.peek().is_none() + } + + fn eat(&mut self) -> Option>> { + let token = self.peek()?; + self.peeked = None; + Some(token) + } + + fn peek(&mut self) -> Option>> { + let tokens = &mut self.tokens; + let token = (*self.peeked.get_or_insert_with(|| tokens.next()))?; + + // Check for unclosed groups. + if Group::is_delimiter(token.v) { + if self.delimiters.iter().rev().any(|&(_, end)| token.v == end) { + return None; + } + } + + Some(token) + } + + fn pos(&self) -> Pos { + self.peeked + .flatten() + .map(|s| s.span.start) + .unwrap_or_else(|| self.tokens.pos()) + } +} + +#[derive(Debug, Copy, Clone, Eq, PartialEq)] +enum Group { + Paren, + Bracket, + Brace, + Subheader, +} + +impl Group { + fn is_delimiter(token: Token<'_>) -> bool { + matches!( + token, + Token::RightParen | Token::RightBracket | Token::RightBrace | Token::Chain + ) + } + + fn start(self) -> Option> { + match self { + Self::Paren => Some(Token::LeftParen), + Self::Bracket => Some(Token::LeftBracket), + Self::Brace => Some(Token::LeftBrace), + Self::Subheader => None, + } + } + + fn end(self) -> Token<'static> { + match self { + Self::Paren => Token::RightParen, + Self::Bracket => Token::RightBracket, + Self::Brace => Token::RightBrace, + Self::Subheader => Token::Chain, + } + } +} #[cfg(test)] mod tests; diff --git a/src/parse/parser.rs b/src/parse/parser.rs deleted file mode 100644 index 3446af83..00000000 --- a/src/parse/parser.rs +++ /dev/null @@ -1,645 +0,0 @@ -use std::str::FromStr; - -use super::escaping::*; -use super::*; -use crate::color::RgbaColor; -use crate::compute::table::SpannedEntry; -use crate::syntax::*; -use crate::{Feedback, Pass}; - -/// Parse a string of source code. -pub fn parse(src: &str) -> Pass { - Parser::new(src).parse() -} - -struct Parser<'s> { - tokens: Tokens<'s>, - peeked: Option>>>, - delimiters: Vec<(Pos, Token<'static>)>, - at_block_or_line_start: bool, - feedback: Feedback, -} - -impl<'s> Parser<'s> { - fn new(src: &'s str) -> Self { - Self { - tokens: Tokens::new(src, TokenMode::Body), - peeked: None, - delimiters: vec![], - at_block_or_line_start: true, - feedback: Feedback::new(), - } - } - - fn parse(mut self) -> Pass { - let tree = self.parse_body_contents(); - Pass::new(tree, self.feedback) - } -} - -// Typesetting content. -impl Parser<'_> { - fn parse_body_contents(&mut self) -> SyntaxTree { - let mut tree = SyntaxTree::new(); - - self.at_block_or_line_start = true; - while !self.eof() { - if let Some(node) = self.parse_node() { - tree.push(node); - } - } - - tree - } - - fn parse_node(&mut self) -> Option> { - let token = self.peek()?; - let end = Span::at(token.span.end); - - // Set block or line start to false because most nodes have that effect, but - // remember the old value to actually check it for hashtags and because comments - // and spaces want to retain it. - let was_at_block_or_line_start = self.at_block_or_line_start; - self.at_block_or_line_start = false; - - Some(match token.v { - // Starting from two newlines counts as a paragraph break, a single - // newline does not. - Token::Space(n) => { - if n == 0 { - self.at_block_or_line_start = was_at_block_or_line_start; - } else if n >= 1 { - self.at_block_or_line_start = true; - } - - self.with_span(if n >= 2 { - SyntaxNode::Parbreak - } else { - SyntaxNode::Spacing - }) - } - - Token::LineComment(_) | Token::BlockComment(_) => { - self.at_block_or_line_start = was_at_block_or_line_start; - self.eat(); - return None; - } - - Token::LeftBracket => { - let call = self.parse_bracket_call(false); - self.at_block_or_line_start = false; - call.map(SyntaxNode::Call) - } - - Token::Star => self.with_span(SyntaxNode::ToggleBolder), - Token::Underscore => self.with_span(SyntaxNode::ToggleItalic), - Token::Backslash => self.with_span(SyntaxNode::Linebreak), - - Token::Hashtag if was_at_block_or_line_start => { - self.parse_heading().map(SyntaxNode::Heading) - } - - Token::Raw { raw, backticks, terminated } => { - if !terminated { - error!(@self.feedback, end, "expected backtick(s)"); - } - - let raw = if backticks > 1 { - process_raw(raw) - } else { - Raw { - lang: None, - lines: split_lines(raw), - inline: true, - } - }; - - self.with_span(SyntaxNode::Raw(raw)) - } - - Token::Text(text) => self.with_span(SyntaxNode::Text(text.to_string())), - Token::Hashtag => self.with_span(SyntaxNode::Text("#".to_string())), - - Token::UnicodeEscape { sequence, terminated } => { - if !terminated { - error!(@self.feedback, end, "expected closing brace"); - } - - if let Some(c) = hex_to_char(sequence) { - self.with_span(SyntaxNode::Text(c.to_string())) - } else { - error!(@self.feedback, token.span, "invalid unicode escape sequence"); - self.eat(); - return None; - } - } - - unexpected => { - error!(@self.feedback, token.span, "unexpected {}", unexpected.name()); - self.eat(); - return None; - } - }) - } - - fn parse_heading(&mut self) -> Spanned { - let start = self.pos(); - self.assert(Token::Hashtag); - - let mut level = 0; - while self.peekv() == Some(Token::Hashtag) { - level += 1; - self.eat(); - } - - let span = Span::new(start, self.pos()); - let level = level.span_with(span); - - if level.v > 5 { - warning!( - @self.feedback, level.span, - "section depth larger than 6 has no effect", - ); - } - - self.skip_ws(); - - let mut tree = SyntaxTree::new(); - while !self.eof() && !matches!(self.peekv(), Some(Token::Space(n)) if n >= 1) { - if let Some(node) = self.parse_node() { - tree.push(node); - } - } - - let span = Span::new(start, self.pos()); - Heading { level, tree }.span_with(span) - } -} - -// Function calls. -impl Parser<'_> { - fn parse_bracket_call(&mut self, chained: bool) -> Spanned { - let before_bracket = self.pos(); - if !chained { - self.start_group(Group::Bracket); - self.tokens.push_mode(TokenMode::Header); - } - - let before_name = self.pos(); - self.start_group(Group::Subheader); - self.skip_ws(); - let name = self.parse_ident().unwrap_or_else(|| { - self.expected_found_or_at("function name", before_name); - Ident(String::new()).span_with(Span::at(before_name)) - }); - - self.skip_ws(); - - let mut args = match self.eatv() { - Some(Token::Colon) => self.parse_table_contents().0, - Some(_) => { - self.expected_at("colon", name.span.end); - while self.eat().is_some() {} - TableExpr::new() - } - None => TableExpr::new(), - }; - - self.end_group(); - self.skip_ws(); - let (has_chained_child, end) = if self.peek().is_some() { - let item = self.parse_bracket_call(true); - let span = item.span; - let t = vec![item.map(SyntaxNode::Call)]; - args.push(SpannedEntry::val(Expr::Tree(t).span_with(span))); - (true, span.end) - } else { - self.tokens.pop_mode(); - (false, self.end_group().end) - }; - - let start = if chained { before_name } else { before_bracket }; - let mut span = Span::new(start, end); - - if self.check(Token::LeftBracket) && !has_chained_child { - self.start_group(Group::Bracket); - self.tokens.push_mode(TokenMode::Body); - - let body = self.parse_body_contents(); - - self.tokens.pop_mode(); - let body_span = self.end_group(); - - let expr = Expr::Tree(body); - args.push(SpannedEntry::val(expr.span_with(body_span))); - span.expand(body_span); - } - - CallExpr { name, args }.span_with(span) - } - - fn parse_paren_call(&mut self, name: Spanned) -> Spanned { - self.start_group(Group::Paren); - let args = self.parse_table_contents().0; - let args_span = self.end_group(); - let span = Span::merge(name.span, args_span); - CallExpr { name, args }.span_with(span) - } -} - -// Tables. -impl Parser<'_> { - fn parse_table_contents(&mut self) -> (TableExpr, bool) { - let mut table = TableExpr::new(); - let mut comma_and_keyless = true; - - while { - self.skip_ws(); - !self.eof() - } { - let (key, val) = if let Some(ident) = self.parse_ident() { - self.skip_ws(); - - match self.peekv() { - Some(Token::Equals) => { - self.eat(); - self.skip_ws(); - if let Some(value) = self.parse_expr() { - (Some(ident), value) - } else { - self.expected("value"); - continue; - } - } - - Some(Token::LeftParen) => { - let call = self.parse_paren_call(ident); - (None, call.map(Expr::Call)) - } - - _ => (None, ident.map(Expr::Ident)), - } - } else if let Some(value) = self.parse_expr() { - (None, value) - } else { - self.expected("value"); - continue; - }; - - let behind = val.span.end; - if let Some(key) = key { - comma_and_keyless = false; - table.insert(key.v.0, SpannedEntry::new(key.span, val)); - self.feedback - .decorations - .push(Decoration::TableKey.span_with(key.span)); - } else { - table.push(SpannedEntry::val(val)); - } - - if { - self.skip_ws(); - self.eof() - } { - break; - } - - self.expect_at(Token::Comma, behind); - comma_and_keyless = false; - } - - let coercable = comma_and_keyless && !table.is_empty(); - (table, coercable) - } -} - -type Binop = fn(Box>, Box>) -> Expr; - -// Expressions and values. -impl Parser<'_> { - fn parse_expr(&mut self) -> Option> { - self.parse_binops("summand", Self::parse_term, |token| match token { - Token::Plus => Some(Expr::Add), - Token::Hyphen => Some(Expr::Sub), - _ => None, - }) - } - - fn parse_term(&mut self) -> Option> { - self.parse_binops("factor", Self::parse_factor, |token| match token { - Token::Star => Some(Expr::Mul), - Token::Slash => Some(Expr::Div), - _ => None, - }) - } - - /// Parse expression of the form ` ( )*`. - fn parse_binops( - &mut self, - operand_name: &str, - mut parse_operand: impl FnMut(&mut Self) -> Option>, - mut parse_op: impl FnMut(Token) -> Option, - ) -> Option> { - let mut left = parse_operand(self)?; - - self.skip_ws(); - while let Some(token) = self.peek() { - if let Some(op) = parse_op(token.v) { - self.eat(); - self.skip_ws(); - - if let Some(right) = parse_operand(self) { - let span = Span::merge(left.span, right.span); - let v = op(Box::new(left), Box::new(right)); - left = v.span_with(span); - self.skip_ws(); - continue; - } - - error!( - @self.feedback, Span::merge(left.span, token.span), - "missing right {}", operand_name, - ); - } - break; - } - - Some(left) - } - - fn parse_factor(&mut self) -> Option> { - if let Some(hyph) = self.check_eat(Token::Hyphen) { - self.skip_ws(); - if let Some(factor) = self.parse_factor() { - let span = Span::merge(hyph.span, factor.span); - Some(Expr::Neg(Box::new(factor)).span_with(span)) - } else { - error!(@self.feedback, hyph.span, "dangling minus"); - None - } - } else { - self.parse_value() - } - } - - fn parse_value(&mut self) -> Option> { - let Spanned { v: token, span } = self.peek()?; - Some(match token { - // This could be a function call or an identifier. - Token::Ident(id) => { - let name = Ident(id.to_string()).span_with(span); - self.eat(); - self.skip_ws(); - if self.check(Token::LeftParen) { - self.parse_paren_call(name).map(Expr::Call) - } else { - name.map(Expr::Ident) - } - } - - Token::Str { string, terminated } => { - if !terminated { - self.expected_at("quote", span.end); - } - self.with_span(Expr::Str(unescape_string(string))) - } - - Token::Bool(b) => self.with_span(Expr::Bool(b)), - Token::Number(n) => self.with_span(Expr::Number(n)), - Token::Length(s) => self.with_span(Expr::Length(s)), - Token::Hex(s) => { - if let Ok(color) = RgbaColor::from_str(s) { - self.with_span(Expr::Color(color)) - } else { - // Heal color by assuming black. - error!(@self.feedback, span, "invalid color"); - let healed = RgbaColor::new_healed(0, 0, 0, 255); - self.with_span(Expr::Color(healed)) - } - } - - // This could be a table or a parenthesized expression. We parse as - // a table in any case and coerce the table into a value if it is - // coercable (length 1 and no trailing comma). - Token::LeftParen => { - self.start_group(Group::Paren); - let (table, coercable) = self.parse_table_contents(); - let span = self.end_group(); - - let expr = if coercable { - table.into_values().next().expect("table is coercable").val.v - } else { - Expr::Table(table) - }; - - expr.span_with(span) - } - - // This is a content expression. - Token::LeftBrace => { - self.start_group(Group::Brace); - self.tokens.push_mode(TokenMode::Body); - - let tree = self.parse_body_contents(); - - self.tokens.pop_mode(); - let span = self.end_group(); - Expr::Tree(tree).span_with(span) - } - - // This is a bracketed function call. - Token::LeftBracket => { - let call = self.parse_bracket_call(false); - let tree = vec![call.map(SyntaxNode::Call)]; - Expr::Tree(tree).span_with(span) - } - - _ => return None, - }) - } - - fn parse_ident(&mut self) -> Option> { - self.peek().and_then(|token| match token.v { - Token::Ident(id) => Some(self.with_span(Ident(id.to_string()))), - _ => None, - }) - } -} - -// Error handling. -impl Parser<'_> { - fn expect_at(&mut self, token: Token<'_>, pos: Pos) -> bool { - if self.check(token) { - self.eat(); - true - } else { - self.expected_at(token.name(), pos); - false - } - } - - fn expected(&mut self, thing: &str) { - if let Some(found) = self.eat() { - error!( - @self.feedback, found.span, - "expected {}, found {}", thing, found.v.name(), - ); - } else { - error!(@self.feedback, Span::at(self.pos()), "expected {}", thing); - } - } - - fn expected_at(&mut self, thing: &str, pos: Pos) { - error!(@self.feedback, Span::at(pos), "expected {}", thing); - } - - fn expected_found_or_at(&mut self, thing: &str, pos: Pos) { - if self.eof() { - self.expected_at(thing, pos) - } else { - self.expected(thing); - } - } -} - -// Parsing primitives. -impl<'s> Parser<'s> { - fn start_group(&mut self, group: Group) { - let start = self.pos(); - if let Some(start_token) = group.start() { - self.assert(start_token); - } - self.delimiters.push((start, group.end())); - } - - fn end_group(&mut self) -> Span { - let peeked = self.peek(); - - let (start, end_token) = self.delimiters.pop().expect("group was not started"); - - if end_token != Token::Chain && peeked != None { - self.delimiters.push((start, end_token)); - assert_eq!(peeked, None, "unfinished group"); - } - - match self.peeked.unwrap() { - Some(token) if token.v == end_token => { - self.peeked = None; - Span::new(start, token.span.end) - } - _ => { - let end = self.pos(); - if end_token != Token::Chain { - error!( - @self.feedback, Span::at(end), - "expected {}", end_token.name(), - ); - } - Span::new(start, end) - } - } - } - - fn skip_ws(&mut self) { - while matches!( - self.peekv(), - Some(Token::Space(_)) | - Some(Token::LineComment(_)) | - Some(Token::BlockComment(_)) - ) { - self.eat(); - } - } - - fn eatv(&mut self) -> Option> { - self.eat().map(Spanned::value) - } - - fn peekv(&mut self) -> Option> { - self.peek().map(Spanned::value) - } - - fn assert(&mut self, token: Token<'_>) { - assert!(self.check_eat(token).is_some()); - } - - fn check_eat(&mut self, token: Token<'_>) -> Option>> { - if self.check(token) { self.eat() } else { None } - } - - /// Checks if the next token is of some kind - fn check(&mut self, token: Token<'_>) -> bool { - self.peekv() == Some(token) - } - - fn with_span(&mut self, v: T) -> Spanned { - let span = self.eat().expect("expected token").span; - v.span_with(span) - } - - fn eof(&mut self) -> bool { - self.peek().is_none() - } - - fn eat(&mut self) -> Option>> { - let token = self.peek()?; - self.peeked = None; - Some(token) - } - - fn peek(&mut self) -> Option>> { - let tokens = &mut self.tokens; - let token = (*self.peeked.get_or_insert_with(|| tokens.next()))?; - - // Check for unclosed groups. - if Group::is_delimiter(token.v) { - if self.delimiters.iter().rev().any(|&(_, end)| token.v == end) { - return None; - } - } - - Some(token) - } - - fn pos(&self) -> Pos { - self.peeked - .flatten() - .map(|s| s.span.start) - .unwrap_or_else(|| self.tokens.pos()) - } -} - -#[derive(Debug, Copy, Clone, Eq, PartialEq)] -enum Group { - Paren, - Bracket, - Brace, - Subheader, -} - -impl Group { - fn is_delimiter(token: Token<'_>) -> bool { - matches!( - token, - Token::RightParen | Token::RightBracket | Token::RightBrace | Token::Chain - ) - } - - fn start(self) -> Option> { - match self { - Self::Paren => Some(Token::LeftParen), - Self::Bracket => Some(Token::LeftBracket), - Self::Brace => Some(Token::LeftBrace), - Self::Subheader => None, - } - } - - fn end(self) -> Token<'static> { - match self { - Self::Paren => Token::RightParen, - Self::Bracket => Token::RightBracket, - Self::Brace => Token::RightBrace, - Self::Subheader => Token::Chain, - } - } -} diff --git a/src/parse/postprocess.rs b/src/parse/postprocess.rs new file mode 100644 index 00000000..ad4a9057 --- /dev/null +++ b/src/parse/postprocess.rs @@ -0,0 +1,217 @@ +//! Post-processing of strings and raw blocks. + +use super::is_newline_char; +use crate::syntax::{Ident, Raw}; + +/// Resolves all escape sequences in a string. +pub fn unescape_string(string: &str) -> String { + let mut iter = string.chars().peekable(); + let mut out = String::with_capacity(string.len()); + + while let Some(c) = iter.next() { + if c != '\\' { + out.push(c); + continue; + } + + match iter.next() { + Some('\\') => out.push('\\'), + Some('"') => out.push('"'), + + Some('n') => out.push('\n'), + Some('t') => out.push('\t'), + Some('u') if iter.peek() == Some(&'{') => { + iter.next(); + + // TODO: Feedback if closing brace is missing. + let mut sequence = String::new(); + let terminated = loop { + match iter.peek() { + Some('}') => { + iter.next(); + break true; + } + Some(&c) if c.is_ascii_hexdigit() => { + iter.next(); + sequence.push(c); + } + _ => break false, + } + }; + + if let Some(c) = hex_to_char(&sequence) { + out.push(c); + } else { + // TODO: Feedback that escape sequence is wrong. + out.push_str("\\u{"); + out.push_str(&sequence); + if terminated { + out.push('}'); + } + } + } + + other => { + out.push('\\'); + out.extend(other); + } + } + } + + out +} + +/// Resolves the language tag and trims the raw text. +/// +/// Returns: +/// - The language tag +/// - The raw lines +/// - Whether at least one newline was present in the untrimmed text. +pub fn process_raw(raw: &str) -> Raw { + let (lang, inner) = split_after_lang_tag(raw); + let (lines, had_newline) = trim_and_split_raw(inner); + Raw { lang, lines, inline: !had_newline } +} + +/// Parse the lang tag and return it alongside the remaining inner raw text. +fn split_after_lang_tag(raw: &str) -> (Option, &str) { + let mut lang = String::new(); + + let mut inner = raw; + let mut iter = raw.chars(); + + while let Some(c) = iter.next() { + if c == '`' || c.is_whitespace() || is_newline_char(c) { + break; + } + + inner = iter.as_str(); + lang.push(c); + } + + (Ident::new(lang), inner) +} + +/// Trims raw text and splits it into lines. +/// +/// Returns whether at least one newline was contained in `raw`. +fn trim_and_split_raw(raw: &str) -> (Vec, bool) { + // Trims one whitespace at end and start. + let raw = raw.strip_prefix(' ').unwrap_or(raw); + let raw = raw.strip_suffix(' ').unwrap_or(raw); + + let mut lines = split_lines(raw); + let had_newline = lines.len() > 1; + let is_whitespace = |line: &String| line.chars().all(char::is_whitespace); + + // Trims a sequence of whitespace followed by a newline at the start. + if lines.first().map(is_whitespace).unwrap_or(false) { + lines.remove(0); + } + + // Trims a newline followed by a sequence of whitespace at the end. + if lines.last().map(is_whitespace).unwrap_or(false) { + lines.pop(); + } + + (lines, had_newline) +} + +/// Splits a string into a vector of lines (respecting Unicode & Windows line breaks). +pub fn split_lines(text: &str) -> Vec { + let mut iter = text.chars().peekable(); + let mut line = String::new(); + let mut lines = Vec::new(); + + while let Some(c) = iter.next() { + if is_newline_char(c) { + if c == '\r' && iter.peek() == Some(&'\n') { + iter.next(); + } + + lines.push(std::mem::take(&mut line)); + } else { + line.push(c); + } + } + + lines.push(line); + lines +} + +/// Converts a hexademical sequence (without braces or "\u") into a character. +pub fn hex_to_char(sequence: &str) -> Option { + u32::from_str_radix(sequence, 16).ok().and_then(std::char::from_u32) +} + +#[cfg(test)] +#[rustfmt::skip] +mod tests { + use super::*; + + #[test] + fn test_unescape_strings() { + fn test(string: &str, expected: &str) { + assert_eq!(unescape_string(string), expected.to_string()); + } + + test(r#"hello world"#, "hello world"); + test(r#"hello\nworld"#, "hello\nworld"); + test(r#"a\"bc"#, "a\"bc"); + test(r#"a\u{2603}bc"#, "a☃bc"); + test(r#"a\u{26c3bg"#, "a𦰻g"); + test(r#"av\u{6797"#, "av林"); + test(r#"a\\"#, "a\\"); + test(r#"a\\\nbc"#, "a\\\nbc"); + test(r#"a\tbc"#, "a\tbc"); + test(r"🌎", "🌎"); + test(r"🌎\", r"🌎\"); + test(r"\🌎", r"\🌎"); + } + + #[test] + fn test_split_after_lang_tag() { + fn test(raw: &str, lang: Option<&str>, inner: &str) { + let (found_lang, found_inner) = split_after_lang_tag(raw); + assert_eq!(found_lang.as_ref().map(|id| id.as_str()), lang); + assert_eq!(found_inner, inner); + } + + test("typst it!", Some("typst"), " it!"); + test("typst\n it!", Some("typst"), "\n it!"); + test("typst\n it!", Some("typst"), "\n it!"); + test("abc`", Some("abc"), "`"); + test(" hi", None, " hi"); + test("`", None, "`"); + } + + #[test] + fn test_trim_raw() { + fn test(raw: &str, expected: Vec<&str>) { + assert_eq!(trim_and_split_raw(raw).0, expected); + } + + test(" hi", vec!["hi"]); + test(" hi", vec![" hi"]); + test("\nhi", vec!["hi"]); + test(" \n hi", vec![" hi"]); + test("hi ", vec!["hi"]); + test("hi ", vec!["hi "]); + test("hi\n", vec!["hi"]); + test("hi \n ", vec!["hi "]); + test(" \n hi \n ", vec![" hi "]); + } + + #[test] + fn test_split_lines() { + fn test(raw: &str, expected: Vec<&str>) { + assert_eq!(split_lines(raw), expected); + } + + test("raw\ntext", vec!["raw", "text"]); + test("a\r\nb", vec!["a", "b"]); + test("a\n\nb", vec!["a", "", "b"]); + test("a\r\x0Bb", vec!["a", "", "b"]); + test("a\r\n\r\nb", vec!["a", "", "b"]); + } +} diff --git a/src/parse/tests.rs b/src/parse/tests.rs index 8ddf013d..a753378e 100644 --- a/src/parse/tests.rs +++ b/src/parse/tests.rs @@ -1,3 +1,5 @@ +//! Parser tests. + #![allow(non_snake_case)] use std::fmt::Debug; diff --git a/src/parse/tokenizer.rs b/src/parse/tokenizer.rs deleted file mode 100644 index 720bec43..00000000 --- a/src/parse/tokenizer.rs +++ /dev/null @@ -1,606 +0,0 @@ -//! Tokenization. - -use std::iter::Peekable; -use std::str::Chars; -use unicode_xid::UnicodeXID; - -use crate::length::Length; -use crate::syntax::{Pos, Span, SpanWith, Spanned, Token}; - -use Token::*; -use TokenMode::*; - -/// An iterator over the tokens of a string of source code. -#[derive(Debug)] -pub struct Tokens<'s> { - src: &'s str, - iter: Peekable>, - mode: TokenMode, - stack: Vec, - index: usize, -} - -/// Whether to tokenize in header mode which yields expression, comma and -/// similar tokens or in body mode which yields text and star, underscore, -/// backtick tokens. -#[derive(Debug, Copy, Clone, Eq, PartialEq, Hash)] -pub enum TokenMode { - Header, - Body, -} - -impl<'s> Tokens<'s> { - /// Create a new token iterator with the given mode. - pub fn new(src: &'s str, mode: TokenMode) -> Self { - Self { - src, - iter: src.chars().peekable(), - mode, - stack: vec![], - index: 0, - } - } - - /// Change the token mode and push the old one on a stack. - pub fn push_mode(&mut self, mode: TokenMode) { - self.stack.push(self.mode); - self.mode = mode; - } - - /// Pop the old token mode from the stack. This panics if there is no mode - /// on the stack. - pub fn pop_mode(&mut self) { - self.mode = self.stack.pop().expect("no pushed mode"); - } - - /// The position in the string at which the last token ends and next token - /// will start. - pub fn pos(&self) -> Pos { - self.index.into() - } -} - -impl<'s> Iterator for Tokens<'s> { - type Item = Spanned>; - - /// Parse the next token in the source code. - fn next(&mut self) -> Option { - let start = self.pos(); - let first = self.eat()?; - - let token = match first { - // Comments. - '/' if self.peek() == Some('/') => self.read_line_comment(), - '/' if self.peek() == Some('*') => self.read_block_comment(), - '*' if self.peek() == Some('/') => { - self.eat(); - Invalid("*/") - } - - // Whitespace. - c if c.is_whitespace() => self.read_whitespace(c), - - // Functions and blocks. - '[' => LeftBracket, - ']' => RightBracket, - '{' => LeftBrace, - '}' => RightBrace, - - // Syntactic elements in function headers. - '(' if self.mode == Header => LeftParen, - ')' if self.mode == Header => RightParen, - ':' if self.mode == Header => Colon, - ',' if self.mode == Header => Comma, - '=' if self.mode == Header => Equals, - '>' if self.mode == Header && self.peek() == Some('>') => self.read_chain(), - - // Expression operators. - '+' if self.mode == Header => Plus, - '-' if self.mode == Header => Hyphen, - '/' if self.mode == Header => Slash, - - // Star serves a double purpose as a style modifier - // and a expression operator in the header. - '*' => Star, - - // A hex expression. - '#' if self.mode == Header => self.read_hex(), - - // String values. - '"' if self.mode == Header => self.read_string(), - - // Style toggles. - '_' if self.mode == Body => Underscore, - '`' if self.mode == Body => self.read_raw(), - - // Sections. - '#' if self.mode == Body => Hashtag, - - // Non-breaking spaces. - '~' if self.mode == Body => Text("\u{00A0}"), - - // An escaped thing. - '\\' if self.mode == Body => self.read_escaped(), - - // Expressions or just strings. - c => { - let body = self.mode == Body; - - let start_offset = -(c.len_utf8() as isize); - let mut last_was_e = false; - - let (text, _) = self.read_string_until(false, start_offset, 0, |n| { - let val = match n { - c if c.is_whitespace() => true, - '[' | ']' | '{' | '}' | '/' | '*' => true, - '\\' | '_' | '`' | '#' | '~' if body => true, - ':' | '=' | ',' | '"' | '(' | ')' if !body => true, - '+' | '-' if !body && !last_was_e => true, - _ => false, - }; - - last_was_e = n == 'e' || n == 'E'; - val - }); - - if self.mode == Header { - self.read_expr(text) - } else { - Text(text) - } - } - }; - - let end = self.pos(); - - Some(token.span_with(Span::new(start, end))) - } -} - -impl<'s> Tokens<'s> { - fn read_line_comment(&mut self) -> Token<'s> { - self.eat(); - LineComment(self.read_string_until(false, 0, 0, is_newline_char).0) - } - - fn read_block_comment(&mut self) -> Token<'s> { - enum Last { - Slash, - Star, - Other, - } - - let mut depth = 0; - let mut last = Last::Other; - - // Find the first `*/` that does not correspond to a nested `/*`. - // Remove the last two bytes to obtain the raw inner text without `*/`. - self.eat(); - let (content, _) = self.read_string_until(true, 0, -2, |c| { - match c { - '/' => match last { - Last::Star if depth == 0 => return true, - Last::Star => depth -= 1, - _ => last = Last::Slash, - }, - '*' => match last { - Last::Slash => depth += 1, - _ => last = Last::Star, - }, - _ => last = Last::Other, - } - - false - }); - - BlockComment(content) - } - - fn read_chain(&mut self) -> Token<'s> { - assert!(self.eat() == Some('>')); - Chain - } - - fn read_whitespace(&mut self, mut c: char) -> Token<'s> { - let mut newlines = 0; - - loop { - if is_newline_char(c) { - if c == '\r' && self.peek() == Some('\n') { - self.eat(); - } - - newlines += 1; - } - - match self.peek() { - Some(n) if n.is_whitespace() => { - self.eat(); - c = n; - } - _ => break, - } - } - - Space(newlines) - } - - fn read_string(&mut self) -> Token<'s> { - let (string, terminated) = self.read_until_unescaped('"'); - Str { string, terminated } - } - - fn read_raw(&mut self) -> Token<'s> { - let mut backticks = 1; - while self.peek() == Some('`') { - self.eat(); - backticks += 1; - } - - let start = self.index; - - let mut found = 0; - while found < backticks { - match self.eat() { - Some('`') => found += 1, - Some(_) => found = 0, - None => break, - } - } - - let terminated = found == backticks; - let end = self.index - if terminated { found } else { 0 }; - - Raw { - raw: &self.src[start .. end], - backticks, - terminated, - } - } - - fn read_until_unescaped(&mut self, end: char) -> (&'s str, bool) { - let mut escaped = false; - self.read_string_until(true, 0, -1, |c| { - match c { - c if c == end && !escaped => return true, - '\\' => escaped = !escaped, - _ => escaped = false, - } - - false - }) - } - - fn read_escaped(&mut self) -> Token<'s> { - fn is_escapable(c: char) -> bool { - match c { - '[' | ']' | '\\' | '/' | '*' | '_' | '`' | '"' | '#' | '~' => true, - _ => false, - } - } - - match self.peek() { - Some('u') => { - self.eat(); - if self.peek() == Some('{') { - self.eat(); - let (sequence, _) = - self.read_string_until(false, 0, 0, |c| !c.is_ascii_hexdigit()); - - let terminated = self.peek() == Some('}'); - if terminated { - self.eat(); - } - - UnicodeEscape { sequence, terminated } - } else { - Text("\\u") - } - } - Some(c) if is_escapable(c) => { - let index = self.index; - self.eat(); - Text(&self.src[index .. index + c.len_utf8()]) - } - Some(c) if c.is_whitespace() => Backslash, - Some(_) => Text("\\"), - None => Backslash, - } - } - - fn read_hex(&mut self) -> Token<'s> { - // This will parse more than the permissable 0-9, a-f, A-F character - // ranges to provide nicer error messages later. - Hex(self.read_string_until(false, 0, 0, |n| !n.is_ascii_alphanumeric()).0) - } - - fn read_expr(&mut self, text: &'s str) -> Token<'s> { - if let Ok(b) = text.parse::() { - Bool(b) - } else if let Ok(num) = text.parse::() { - Number(num) - } else if let Some(num) = parse_percentage(text) { - Number(num / 100.0) - } else if let Ok(length) = text.parse::() { - Length(length) - } else if is_identifier(text) { - Ident(text) - } else { - Invalid(text) - } - } - - /// Will read the input stream until `f` evaluates to `true`. When - /// `eat_match` is true, the token for which `f` was true is consumed. - /// Returns the string from the index where this was called offset by - /// `offset_start` to the end offset by `offset_end`. The end is before or - /// after the match depending on `eat_match`. - fn read_string_until( - &mut self, - eat_match: bool, - offset_start: isize, - offset_end: isize, - mut f: impl FnMut(char) -> bool, - ) -> (&'s str, bool) { - let start = ((self.index as isize) + offset_start) as usize; - let mut matched = false; - - while let Some(c) = self.peek() { - if f(c) { - matched = true; - if eat_match { - self.eat(); - } - break; - } - - self.eat(); - } - - let mut end = self.index; - if matched { - end = ((end as isize) + offset_end) as usize; - } - - (&self.src[start .. end], matched) - } - - fn eat(&mut self) -> Option { - let c = self.iter.next()?; - self.index += c.len_utf8(); - Some(c) - } - - fn peek(&mut self) -> Option { - self.iter.peek().copied() - } -} - -fn parse_percentage(text: &str) -> Option { - if text.ends_with('%') { - text[.. text.len() - 1].parse::().ok() - } else { - None - } -} - -/// Whether this character denotes a newline. -pub fn is_newline_char(character: char) -> bool { - match character { - // Line Feed, Vertical Tab, Form Feed, Carriage Return. - '\x0A' ..= '\x0D' => true, - // Next Line, Line Separator, Paragraph Separator. - '\u{0085}' | '\u{2028}' | '\u{2029}' => true, - _ => false, - } -} - -/// Whether this word is a valid identifier. -pub fn is_identifier(string: &str) -> bool { - fn is_extra_allowed(c: char) -> bool { - c == '.' || c == '-' || c == '_' - } - - let mut chars = string.chars(); - match chars.next() { - Some(c) if UnicodeXID::is_xid_start(c) || is_extra_allowed(c) => {} - _ => return false, - } - - for c in chars { - match c { - c if UnicodeXID::is_xid_continue(c) || is_extra_allowed(c) => {} - _ => return false, - } - } - - true -} - -#[cfg(test)] -#[allow(non_snake_case)] -mod tests { - use super::*; - use crate::length::Length; - use crate::parse::tests::{check, s}; - - use Token::{ - BlockComment as BC, Bool, Chain, Hex, Hyphen as Min, Ident as Id, - LeftBrace as LB, LeftBracket as L, LeftParen as LP, Length as Len, - LineComment as LC, Number as Num, Plus, RightBrace as RB, RightBracket as R, - RightParen as RP, Slash, Space as S, Star, Text as T, - }; - - fn Str(string: &str, terminated: bool) -> Token { - Token::Str { string, terminated } - } - fn Raw(raw: &str, backticks: usize, terminated: bool) -> Token { - Token::Raw { raw, backticks, terminated } - } - fn UE(sequence: &str, terminated: bool) -> Token { - Token::UnicodeEscape { sequence, terminated } - } - - macro_rules! t { ($($tts:tt)*) => {test!(@spans=false, $($tts)*)} } - macro_rules! ts { ($($tts:tt)*) => {test!(@spans=true, $($tts)*)} } - macro_rules! test { - (@spans=$spans:expr, $mode:expr, $src:expr => $($token:expr),*) => { - let exp = vec![$(Into::>::into($token)),*]; - let found = Tokens::new($src, $mode).collect::>(); - check($src, exp, found, $spans); - } - } - - #[test] - fn tokenize_whitespace() { - t!(Body, "" => ); - t!(Body, " " => S(0)); - t!(Body, " " => S(0)); - t!(Body, "\t" => S(0)); - t!(Body, " \t" => S(0)); - t!(Body, "\n" => S(1)); - t!(Body, "\n " => S(1)); - t!(Body, " \n" => S(1)); - t!(Body, " \n " => S(1)); - t!(Body, "\r\n" => S(1)); - t!(Body, " \n\t \n " => S(2)); - t!(Body, "\n\r" => S(2)); - t!(Body, " \r\r\n \x0D" => S(3)); - t!(Body, "a~b" => T("a"), T("\u{00A0}"), T("b")); - } - - #[test] - fn tokenize_comments() { - t!(Body, "a // bc\n " => T("a"), S(0), LC(" bc"), S(1)); - t!(Body, "a //a//b\n " => T("a"), S(0), LC("a//b"), S(1)); - t!(Body, "a //a//b\r\n" => T("a"), S(0), LC("a//b"), S(1)); - t!(Body, "a //a//b\n\nhello" => T("a"), S(0), LC("a//b"), S(2), T("hello")); - t!(Body, "/**/" => BC("")); - t!(Body, "_/*_/*a*/*/" => Underscore, BC("_/*a*/")); - t!(Body, "/*/*/" => BC("/*/")); - t!(Body, "abc*/" => T("abc"), Invalid("*/")); - t!(Body, "/***/" => BC("*")); - t!(Body, "/**\\****/*/*/" => BC("*\\***"), Invalid("*/"), Invalid("*/")); - t!(Body, "/*abc" => BC("abc")); - } - - #[test] - fn tokenize_body_only_tokens() { - t!(Body, "_*" => Underscore, Star); - t!(Body, "***" => Star, Star, Star); - t!(Body, "[func]*bold*" => L, T("func"), R, Star, T("bold"), Star); - t!(Body, "hi_you_ there" => T("hi"), Underscore, T("you"), Underscore, S(0), T("there")); - t!(Body, "# hi" => Hashtag, S(0), T("hi")); - t!(Body, "#()" => Hashtag, T("()")); - t!(Header, "_`" => Invalid("_`")); - } - - #[test] - fn test_tokenize_raw() { - // Basics. - t!(Body, "`raw`" => Raw("raw", 1, true)); - t!(Body, "`[func]`" => Raw("[func]", 1, true)); - t!(Body, "`]" => Raw("]", 1, false)); - t!(Body, r"`\`` " => Raw(r"\", 1, true), Raw(" ", 1, false)); - - // Language tag. - t!(Body, "``` hi```" => Raw(" hi", 3, true)); - t!(Body, "```rust hi```" => Raw("rust hi", 3, true)); - t!(Body, r"``` hi\````" => Raw(r" hi\", 3, true), Raw("", 1, false)); - t!(Body, "``` not `y`e`t finished```" => Raw(" not `y`e`t finished", 3, true)); - t!(Body, "```js \r\n document.write(\"go\")`" - => Raw("js \r\n document.write(\"go\")`", 3, false)); - - // More backticks. - t!(Body, "`````` ``````hi" => Raw(" ", 6, true), T("hi")); - t!(Body, "````\n```js\nalert()\n```\n````" => Raw("\n```js\nalert()\n```\n", 4, true)); - } - - #[test] - fn tokenize_header_only_tokens() { - t!(Body, "a: b" => T("a:"), S(0), T("b")); - t!(Body, "c=d, " => T("c=d,"), S(0)); - t!(Header, "(){}:=," => LP, RP, LB, RB, Colon, Equals, Comma); - t!(Header, "a:b" => Id("a"), Colon, Id("b")); - t!(Header, "#6ae6dd" => Hex("6ae6dd")); - t!(Header, "#8A083c" => Hex("8A083c")); - t!(Header, "a: true, x=1" => Id("a"), Colon, S(0), Bool(true), Comma, S(0), - Id("x"), Equals, Num(1.0)); - t!(Header, "=3.14" => Equals, Num(3.14)); - t!(Header, "12.3e5" => Num(12.3e5)); - t!(Header, "120%" => Num(1.2)); - t!(Header, "12e4%" => Num(1200.0)); - t!(Header, "__main__" => Id("__main__")); - t!(Header, ">main" => Invalid(">main")); - t!(Header, ".func.box" => Id(".func.box")); - t!(Header, "arg, _b, _1" => Id("arg"), Comma, S(0), Id("_b"), Comma, S(0), Id("_1")); - t!(Header, "f: arg >> g" => Id("f"), Colon, S(0), Id("arg"), S(0), Chain, S(0), Id("g")); - t!(Header, "12_pt, 12pt" => Invalid("12_pt"), Comma, S(0), Len(Length::pt(12.0))); - t!(Header, "1e5in" => Len(Length::inches(100000.0))); - t!(Header, "2.3cm" => Len(Length::cm(2.3))); - t!(Header, "12e-3in" => Len(Length::inches(12e-3))); - t!(Header, "6.1cm + 4pt,a=1*2" => Len(Length::cm(6.1)), S(0), Plus, S(0), Len(Length::pt(4.0)), - Comma, Id("a"), Equals, Num(1.0), Star, Num(2.0)); - t!(Header, "(5 - 1) / 2.1" => LP, Num(5.0), S(0), Min, S(0), Num(1.0), RP, - S(0), Slash, S(0), Num(2.1)); - t!(Header, "-1" => Min, Num(1.0)); - t!(Header, "--1" => Min, Min, Num(1.0)); - t!(Header, "- 1" => Min, S(0), Num(1.0)); - t!(Header, "02.4mm" => Len(Length::mm(2.4))); - t!(Header, "2.4.cm" => Invalid("2.4.cm")); - t!(Header, "(1,2)" => LP, Num(1.0), Comma, Num(2.0), RP); - t!(Header, "{abc}" => LB, Id("abc"), RB); - t!(Header, "🌓, 🌍," => Invalid("🌓"), Comma, S(0), Invalid("🌍"), Comma); - } - - #[test] - fn tokenize_strings() { - t!(Body, "a \"hi\" string" => T("a"), S(0), T("\"hi\""), S(0), T("string")); - t!(Header, "\"hello" => Str("hello", false)); - t!(Header, "\"hello world\"" => Str("hello world", true)); - t!(Header, "\"hello\nworld\"" => Str("hello\nworld", true)); - t!(Header, r#"1"hello\nworld"false"# => Num(1.0), Str("hello\\nworld", true), Bool(false)); - t!(Header, r#""a\"bc""# => Str(r#"a\"bc"#, true)); - t!(Header, r#""a\\"bc""# => Str(r#"a\\"#, true), Id("bc"), Str("", false)); - t!(Header, r#""a\tbc"# => Str("a\\tbc", false)); - t!(Header, "\"🌎\"" => Str("🌎", true)); - } - - #[test] - fn tokenize_escaped_symbols() { - t!(Body, r"\\" => T(r"\")); - t!(Body, r"\[" => T("[")); - t!(Body, r"\]" => T("]")); - t!(Body, r"\*" => T("*")); - t!(Body, r"\_" => T("_")); - t!(Body, r"\`" => T("`")); - t!(Body, r"\/" => T("/")); - t!(Body, r"\u{2603}" => UE("2603", true)); - t!(Body, r"\u{26A4" => UE("26A4", false)); - t!(Body, r#"\""# => T("\"")); - } - - #[test] - fn tokenize_unescapable_symbols() { - t!(Body, r"\a" => T("\\"), T("a")); - t!(Body, r"\:" => T(r"\"), T(":")); - t!(Body, r"\=" => T(r"\"), T("=")); - t!(Body, r"\u{2GA4" => UE("2", false), T("GA4")); - t!(Body, r"\u{ " => UE("", false), Space(0)); - t!(Body, r"\u" => T(r"\u")); - t!(Header, r"\\\\" => Invalid(r"\\\\")); - t!(Header, r"\a" => Invalid(r"\a")); - t!(Header, r"\:" => Invalid(r"\"), Colon); - t!(Header, r"\=" => Invalid(r"\"), Equals); - t!(Header, r"\," => Invalid(r"\"), Comma); - } - - #[test] - fn tokenize_with_spans() { - ts!(Body, "hello" => s(0, 5, T("hello"))); - ts!(Body, "ab\r\nc" => s(0, 2, T("ab")), s(2, 4, S(1)), s(4, 5, T("c"))); - ts!(Body, "// ab\r\n\nf" => s(0, 5, LC(" ab")), s(5, 8, S(2)), s(8, 9, T("f"))); - ts!(Body, "/*b*/_" => s(0, 5, BC("b")), s(5, 6, Underscore)); - ts!(Header, "a=10" => s(0, 1, Id("a")), s(1, 2, Equals), s(2, 4, Num(10.0))); - } -} diff --git a/src/parse/tokens.rs b/src/parse/tokens.rs new file mode 100644 index 00000000..720bec43 --- /dev/null +++ b/src/parse/tokens.rs @@ -0,0 +1,606 @@ +//! Tokenization. + +use std::iter::Peekable; +use std::str::Chars; +use unicode_xid::UnicodeXID; + +use crate::length::Length; +use crate::syntax::{Pos, Span, SpanWith, Spanned, Token}; + +use Token::*; +use TokenMode::*; + +/// An iterator over the tokens of a string of source code. +#[derive(Debug)] +pub struct Tokens<'s> { + src: &'s str, + iter: Peekable>, + mode: TokenMode, + stack: Vec, + index: usize, +} + +/// Whether to tokenize in header mode which yields expression, comma and +/// similar tokens or in body mode which yields text and star, underscore, +/// backtick tokens. +#[derive(Debug, Copy, Clone, Eq, PartialEq, Hash)] +pub enum TokenMode { + Header, + Body, +} + +impl<'s> Tokens<'s> { + /// Create a new token iterator with the given mode. + pub fn new(src: &'s str, mode: TokenMode) -> Self { + Self { + src, + iter: src.chars().peekable(), + mode, + stack: vec![], + index: 0, + } + } + + /// Change the token mode and push the old one on a stack. + pub fn push_mode(&mut self, mode: TokenMode) { + self.stack.push(self.mode); + self.mode = mode; + } + + /// Pop the old token mode from the stack. This panics if there is no mode + /// on the stack. + pub fn pop_mode(&mut self) { + self.mode = self.stack.pop().expect("no pushed mode"); + } + + /// The position in the string at which the last token ends and next token + /// will start. + pub fn pos(&self) -> Pos { + self.index.into() + } +} + +impl<'s> Iterator for Tokens<'s> { + type Item = Spanned>; + + /// Parse the next token in the source code. + fn next(&mut self) -> Option { + let start = self.pos(); + let first = self.eat()?; + + let token = match first { + // Comments. + '/' if self.peek() == Some('/') => self.read_line_comment(), + '/' if self.peek() == Some('*') => self.read_block_comment(), + '*' if self.peek() == Some('/') => { + self.eat(); + Invalid("*/") + } + + // Whitespace. + c if c.is_whitespace() => self.read_whitespace(c), + + // Functions and blocks. + '[' => LeftBracket, + ']' => RightBracket, + '{' => LeftBrace, + '}' => RightBrace, + + // Syntactic elements in function headers. + '(' if self.mode == Header => LeftParen, + ')' if self.mode == Header => RightParen, + ':' if self.mode == Header => Colon, + ',' if self.mode == Header => Comma, + '=' if self.mode == Header => Equals, + '>' if self.mode == Header && self.peek() == Some('>') => self.read_chain(), + + // Expression operators. + '+' if self.mode == Header => Plus, + '-' if self.mode == Header => Hyphen, + '/' if self.mode == Header => Slash, + + // Star serves a double purpose as a style modifier + // and a expression operator in the header. + '*' => Star, + + // A hex expression. + '#' if self.mode == Header => self.read_hex(), + + // String values. + '"' if self.mode == Header => self.read_string(), + + // Style toggles. + '_' if self.mode == Body => Underscore, + '`' if self.mode == Body => self.read_raw(), + + // Sections. + '#' if self.mode == Body => Hashtag, + + // Non-breaking spaces. + '~' if self.mode == Body => Text("\u{00A0}"), + + // An escaped thing. + '\\' if self.mode == Body => self.read_escaped(), + + // Expressions or just strings. + c => { + let body = self.mode == Body; + + let start_offset = -(c.len_utf8() as isize); + let mut last_was_e = false; + + let (text, _) = self.read_string_until(false, start_offset, 0, |n| { + let val = match n { + c if c.is_whitespace() => true, + '[' | ']' | '{' | '}' | '/' | '*' => true, + '\\' | '_' | '`' | '#' | '~' if body => true, + ':' | '=' | ',' | '"' | '(' | ')' if !body => true, + '+' | '-' if !body && !last_was_e => true, + _ => false, + }; + + last_was_e = n == 'e' || n == 'E'; + val + }); + + if self.mode == Header { + self.read_expr(text) + } else { + Text(text) + } + } + }; + + let end = self.pos(); + + Some(token.span_with(Span::new(start, end))) + } +} + +impl<'s> Tokens<'s> { + fn read_line_comment(&mut self) -> Token<'s> { + self.eat(); + LineComment(self.read_string_until(false, 0, 0, is_newline_char).0) + } + + fn read_block_comment(&mut self) -> Token<'s> { + enum Last { + Slash, + Star, + Other, + } + + let mut depth = 0; + let mut last = Last::Other; + + // Find the first `*/` that does not correspond to a nested `/*`. + // Remove the last two bytes to obtain the raw inner text without `*/`. + self.eat(); + let (content, _) = self.read_string_until(true, 0, -2, |c| { + match c { + '/' => match last { + Last::Star if depth == 0 => return true, + Last::Star => depth -= 1, + _ => last = Last::Slash, + }, + '*' => match last { + Last::Slash => depth += 1, + _ => last = Last::Star, + }, + _ => last = Last::Other, + } + + false + }); + + BlockComment(content) + } + + fn read_chain(&mut self) -> Token<'s> { + assert!(self.eat() == Some('>')); + Chain + } + + fn read_whitespace(&mut self, mut c: char) -> Token<'s> { + let mut newlines = 0; + + loop { + if is_newline_char(c) { + if c == '\r' && self.peek() == Some('\n') { + self.eat(); + } + + newlines += 1; + } + + match self.peek() { + Some(n) if n.is_whitespace() => { + self.eat(); + c = n; + } + _ => break, + } + } + + Space(newlines) + } + + fn read_string(&mut self) -> Token<'s> { + let (string, terminated) = self.read_until_unescaped('"'); + Str { string, terminated } + } + + fn read_raw(&mut self) -> Token<'s> { + let mut backticks = 1; + while self.peek() == Some('`') { + self.eat(); + backticks += 1; + } + + let start = self.index; + + let mut found = 0; + while found < backticks { + match self.eat() { + Some('`') => found += 1, + Some(_) => found = 0, + None => break, + } + } + + let terminated = found == backticks; + let end = self.index - if terminated { found } else { 0 }; + + Raw { + raw: &self.src[start .. end], + backticks, + terminated, + } + } + + fn read_until_unescaped(&mut self, end: char) -> (&'s str, bool) { + let mut escaped = false; + self.read_string_until(true, 0, -1, |c| { + match c { + c if c == end && !escaped => return true, + '\\' => escaped = !escaped, + _ => escaped = false, + } + + false + }) + } + + fn read_escaped(&mut self) -> Token<'s> { + fn is_escapable(c: char) -> bool { + match c { + '[' | ']' | '\\' | '/' | '*' | '_' | '`' | '"' | '#' | '~' => true, + _ => false, + } + } + + match self.peek() { + Some('u') => { + self.eat(); + if self.peek() == Some('{') { + self.eat(); + let (sequence, _) = + self.read_string_until(false, 0, 0, |c| !c.is_ascii_hexdigit()); + + let terminated = self.peek() == Some('}'); + if terminated { + self.eat(); + } + + UnicodeEscape { sequence, terminated } + } else { + Text("\\u") + } + } + Some(c) if is_escapable(c) => { + let index = self.index; + self.eat(); + Text(&self.src[index .. index + c.len_utf8()]) + } + Some(c) if c.is_whitespace() => Backslash, + Some(_) => Text("\\"), + None => Backslash, + } + } + + fn read_hex(&mut self) -> Token<'s> { + // This will parse more than the permissable 0-9, a-f, A-F character + // ranges to provide nicer error messages later. + Hex(self.read_string_until(false, 0, 0, |n| !n.is_ascii_alphanumeric()).0) + } + + fn read_expr(&mut self, text: &'s str) -> Token<'s> { + if let Ok(b) = text.parse::() { + Bool(b) + } else if let Ok(num) = text.parse::() { + Number(num) + } else if let Some(num) = parse_percentage(text) { + Number(num / 100.0) + } else if let Ok(length) = text.parse::() { + Length(length) + } else if is_identifier(text) { + Ident(text) + } else { + Invalid(text) + } + } + + /// Will read the input stream until `f` evaluates to `true`. When + /// `eat_match` is true, the token for which `f` was true is consumed. + /// Returns the string from the index where this was called offset by + /// `offset_start` to the end offset by `offset_end`. The end is before or + /// after the match depending on `eat_match`. + fn read_string_until( + &mut self, + eat_match: bool, + offset_start: isize, + offset_end: isize, + mut f: impl FnMut(char) -> bool, + ) -> (&'s str, bool) { + let start = ((self.index as isize) + offset_start) as usize; + let mut matched = false; + + while let Some(c) = self.peek() { + if f(c) { + matched = true; + if eat_match { + self.eat(); + } + break; + } + + self.eat(); + } + + let mut end = self.index; + if matched { + end = ((end as isize) + offset_end) as usize; + } + + (&self.src[start .. end], matched) + } + + fn eat(&mut self) -> Option { + let c = self.iter.next()?; + self.index += c.len_utf8(); + Some(c) + } + + fn peek(&mut self) -> Option { + self.iter.peek().copied() + } +} + +fn parse_percentage(text: &str) -> Option { + if text.ends_with('%') { + text[.. text.len() - 1].parse::().ok() + } else { + None + } +} + +/// Whether this character denotes a newline. +pub fn is_newline_char(character: char) -> bool { + match character { + // Line Feed, Vertical Tab, Form Feed, Carriage Return. + '\x0A' ..= '\x0D' => true, + // Next Line, Line Separator, Paragraph Separator. + '\u{0085}' | '\u{2028}' | '\u{2029}' => true, + _ => false, + } +} + +/// Whether this word is a valid identifier. +pub fn is_identifier(string: &str) -> bool { + fn is_extra_allowed(c: char) -> bool { + c == '.' || c == '-' || c == '_' + } + + let mut chars = string.chars(); + match chars.next() { + Some(c) if UnicodeXID::is_xid_start(c) || is_extra_allowed(c) => {} + _ => return false, + } + + for c in chars { + match c { + c if UnicodeXID::is_xid_continue(c) || is_extra_allowed(c) => {} + _ => return false, + } + } + + true +} + +#[cfg(test)] +#[allow(non_snake_case)] +mod tests { + use super::*; + use crate::length::Length; + use crate::parse::tests::{check, s}; + + use Token::{ + BlockComment as BC, Bool, Chain, Hex, Hyphen as Min, Ident as Id, + LeftBrace as LB, LeftBracket as L, LeftParen as LP, Length as Len, + LineComment as LC, Number as Num, Plus, RightBrace as RB, RightBracket as R, + RightParen as RP, Slash, Space as S, Star, Text as T, + }; + + fn Str(string: &str, terminated: bool) -> Token { + Token::Str { string, terminated } + } + fn Raw(raw: &str, backticks: usize, terminated: bool) -> Token { + Token::Raw { raw, backticks, terminated } + } + fn UE(sequence: &str, terminated: bool) -> Token { + Token::UnicodeEscape { sequence, terminated } + } + + macro_rules! t { ($($tts:tt)*) => {test!(@spans=false, $($tts)*)} } + macro_rules! ts { ($($tts:tt)*) => {test!(@spans=true, $($tts)*)} } + macro_rules! test { + (@spans=$spans:expr, $mode:expr, $src:expr => $($token:expr),*) => { + let exp = vec![$(Into::>::into($token)),*]; + let found = Tokens::new($src, $mode).collect::>(); + check($src, exp, found, $spans); + } + } + + #[test] + fn tokenize_whitespace() { + t!(Body, "" => ); + t!(Body, " " => S(0)); + t!(Body, " " => S(0)); + t!(Body, "\t" => S(0)); + t!(Body, " \t" => S(0)); + t!(Body, "\n" => S(1)); + t!(Body, "\n " => S(1)); + t!(Body, " \n" => S(1)); + t!(Body, " \n " => S(1)); + t!(Body, "\r\n" => S(1)); + t!(Body, " \n\t \n " => S(2)); + t!(Body, "\n\r" => S(2)); + t!(Body, " \r\r\n \x0D" => S(3)); + t!(Body, "a~b" => T("a"), T("\u{00A0}"), T("b")); + } + + #[test] + fn tokenize_comments() { + t!(Body, "a // bc\n " => T("a"), S(0), LC(" bc"), S(1)); + t!(Body, "a //a//b\n " => T("a"), S(0), LC("a//b"), S(1)); + t!(Body, "a //a//b\r\n" => T("a"), S(0), LC("a//b"), S(1)); + t!(Body, "a //a//b\n\nhello" => T("a"), S(0), LC("a//b"), S(2), T("hello")); + t!(Body, "/**/" => BC("")); + t!(Body, "_/*_/*a*/*/" => Underscore, BC("_/*a*/")); + t!(Body, "/*/*/" => BC("/*/")); + t!(Body, "abc*/" => T("abc"), Invalid("*/")); + t!(Body, "/***/" => BC("*")); + t!(Body, "/**\\****/*/*/" => BC("*\\***"), Invalid("*/"), Invalid("*/")); + t!(Body, "/*abc" => BC("abc")); + } + + #[test] + fn tokenize_body_only_tokens() { + t!(Body, "_*" => Underscore, Star); + t!(Body, "***" => Star, Star, Star); + t!(Body, "[func]*bold*" => L, T("func"), R, Star, T("bold"), Star); + t!(Body, "hi_you_ there" => T("hi"), Underscore, T("you"), Underscore, S(0), T("there")); + t!(Body, "# hi" => Hashtag, S(0), T("hi")); + t!(Body, "#()" => Hashtag, T("()")); + t!(Header, "_`" => Invalid("_`")); + } + + #[test] + fn test_tokenize_raw() { + // Basics. + t!(Body, "`raw`" => Raw("raw", 1, true)); + t!(Body, "`[func]`" => Raw("[func]", 1, true)); + t!(Body, "`]" => Raw("]", 1, false)); + t!(Body, r"`\`` " => Raw(r"\", 1, true), Raw(" ", 1, false)); + + // Language tag. + t!(Body, "``` hi```" => Raw(" hi", 3, true)); + t!(Body, "```rust hi```" => Raw("rust hi", 3, true)); + t!(Body, r"``` hi\````" => Raw(r" hi\", 3, true), Raw("", 1, false)); + t!(Body, "``` not `y`e`t finished```" => Raw(" not `y`e`t finished", 3, true)); + t!(Body, "```js \r\n document.write(\"go\")`" + => Raw("js \r\n document.write(\"go\")`", 3, false)); + + // More backticks. + t!(Body, "`````` ``````hi" => Raw(" ", 6, true), T("hi")); + t!(Body, "````\n```js\nalert()\n```\n````" => Raw("\n```js\nalert()\n```\n", 4, true)); + } + + #[test] + fn tokenize_header_only_tokens() { + t!(Body, "a: b" => T("a:"), S(0), T("b")); + t!(Body, "c=d, " => T("c=d,"), S(0)); + t!(Header, "(){}:=," => LP, RP, LB, RB, Colon, Equals, Comma); + t!(Header, "a:b" => Id("a"), Colon, Id("b")); + t!(Header, "#6ae6dd" => Hex("6ae6dd")); + t!(Header, "#8A083c" => Hex("8A083c")); + t!(Header, "a: true, x=1" => Id("a"), Colon, S(0), Bool(true), Comma, S(0), + Id("x"), Equals, Num(1.0)); + t!(Header, "=3.14" => Equals, Num(3.14)); + t!(Header, "12.3e5" => Num(12.3e5)); + t!(Header, "120%" => Num(1.2)); + t!(Header, "12e4%" => Num(1200.0)); + t!(Header, "__main__" => Id("__main__")); + t!(Header, ">main" => Invalid(">main")); + t!(Header, ".func.box" => Id(".func.box")); + t!(Header, "arg, _b, _1" => Id("arg"), Comma, S(0), Id("_b"), Comma, S(0), Id("_1")); + t!(Header, "f: arg >> g" => Id("f"), Colon, S(0), Id("arg"), S(0), Chain, S(0), Id("g")); + t!(Header, "12_pt, 12pt" => Invalid("12_pt"), Comma, S(0), Len(Length::pt(12.0))); + t!(Header, "1e5in" => Len(Length::inches(100000.0))); + t!(Header, "2.3cm" => Len(Length::cm(2.3))); + t!(Header, "12e-3in" => Len(Length::inches(12e-3))); + t!(Header, "6.1cm + 4pt,a=1*2" => Len(Length::cm(6.1)), S(0), Plus, S(0), Len(Length::pt(4.0)), + Comma, Id("a"), Equals, Num(1.0), Star, Num(2.0)); + t!(Header, "(5 - 1) / 2.1" => LP, Num(5.0), S(0), Min, S(0), Num(1.0), RP, + S(0), Slash, S(0), Num(2.1)); + t!(Header, "-1" => Min, Num(1.0)); + t!(Header, "--1" => Min, Min, Num(1.0)); + t!(Header, "- 1" => Min, S(0), Num(1.0)); + t!(Header, "02.4mm" => Len(Length::mm(2.4))); + t!(Header, "2.4.cm" => Invalid("2.4.cm")); + t!(Header, "(1,2)" => LP, Num(1.0), Comma, Num(2.0), RP); + t!(Header, "{abc}" => LB, Id("abc"), RB); + t!(Header, "🌓, 🌍," => Invalid("🌓"), Comma, S(0), Invalid("🌍"), Comma); + } + + #[test] + fn tokenize_strings() { + t!(Body, "a \"hi\" string" => T("a"), S(0), T("\"hi\""), S(0), T("string")); + t!(Header, "\"hello" => Str("hello", false)); + t!(Header, "\"hello world\"" => Str("hello world", true)); + t!(Header, "\"hello\nworld\"" => Str("hello\nworld", true)); + t!(Header, r#"1"hello\nworld"false"# => Num(1.0), Str("hello\\nworld", true), Bool(false)); + t!(Header, r#""a\"bc""# => Str(r#"a\"bc"#, true)); + t!(Header, r#""a\\"bc""# => Str(r#"a\\"#, true), Id("bc"), Str("", false)); + t!(Header, r#""a\tbc"# => Str("a\\tbc", false)); + t!(Header, "\"🌎\"" => Str("🌎", true)); + } + + #[test] + fn tokenize_escaped_symbols() { + t!(Body, r"\\" => T(r"\")); + t!(Body, r"\[" => T("[")); + t!(Body, r"\]" => T("]")); + t!(Body, r"\*" => T("*")); + t!(Body, r"\_" => T("_")); + t!(Body, r"\`" => T("`")); + t!(Body, r"\/" => T("/")); + t!(Body, r"\u{2603}" => UE("2603", true)); + t!(Body, r"\u{26A4" => UE("26A4", false)); + t!(Body, r#"\""# => T("\"")); + } + + #[test] + fn tokenize_unescapable_symbols() { + t!(Body, r"\a" => T("\\"), T("a")); + t!(Body, r"\:" => T(r"\"), T(":")); + t!(Body, r"\=" => T(r"\"), T("=")); + t!(Body, r"\u{2GA4" => UE("2", false), T("GA4")); + t!(Body, r"\u{ " => UE("", false), Space(0)); + t!(Body, r"\u" => T(r"\u")); + t!(Header, r"\\\\" => Invalid(r"\\\\")); + t!(Header, r"\a" => Invalid(r"\a")); + t!(Header, r"\:" => Invalid(r"\"), Colon); + t!(Header, r"\=" => Invalid(r"\"), Equals); + t!(Header, r"\," => Invalid(r"\"), Comma); + } + + #[test] + fn tokenize_with_spans() { + ts!(Body, "hello" => s(0, 5, T("hello"))); + ts!(Body, "ab\r\nc" => s(0, 2, T("ab")), s(2, 4, S(1)), s(4, 5, T("c"))); + ts!(Body, "// ab\r\n\nf" => s(0, 5, LC(" ab")), s(5, 8, S(2)), s(8, 9, T("f"))); + ts!(Body, "/*b*/_" => s(0, 5, BC("b")), s(5, 6, Underscore)); + ts!(Header, "a=10" => s(0, 1, Id("a")), s(1, 2, Equals), s(2, 4, Num(10.0))); + } +} -- cgit v1.2.3