From ab03f3224078f1b8ca05bc1b65a7df4bebb5f449 Mon Sep 17 00:00:00 2001 From: Laurenz Date: Sat, 14 Jan 2023 12:34:17 +0100 Subject: Reorganize syntax module --- src/syntax/incremental.rs | 16 +- src/syntax/kind.rs | 2 +- src/syntax/lexer.rs | 769 ++++++++++++++++++++++++++++++++++++++++++++++ src/syntax/linked.rs | 274 ----------------- src/syntax/mod.rs | 9 +- src/syntax/node.rs | 272 +++++++++++++++- src/syntax/parser.rs | 26 +- src/syntax/parsing.rs | 14 +- src/syntax/tests.rs | 483 ----------------------------- src/syntax/tokens.rs | 769 ---------------------------------------------- 10 files changed, 1078 insertions(+), 1556 deletions(-) create mode 100644 src/syntax/lexer.rs delete mode 100644 src/syntax/tests.rs delete mode 100644 src/syntax/tokens.rs (limited to 'src/syntax') diff --git a/src/syntax/incremental.rs b/src/syntax/incremental.rs index d5dea9d0..606daa2e 100644 --- a/src/syntax/incremental.rs +++ b/src/syntax/incremental.rs @@ -404,9 +404,23 @@ fn next_at_start(kind: &SyntaxKind, prev: bool) -> bool { #[cfg(test)] #[rustfmt::skip] mod tests { + use std::fmt::Debug; + use super::*; use super::super::{parse, Source}; - use super::super::tests::check; + + #[track_caller] + fn check(text: &str, found: T, expected: T) + where + T: Debug + PartialEq, + { + if found != expected { + println!("source: {text:?}"); + println!("expected: {expected:#?}"); + println!("found: {found:#?}"); + panic!("test failed"); + } + } #[track_caller] fn test(prev: &str, range: Range, with: &str, goal: Range) { diff --git a/src/syntax/kind.rs b/src/syntax/kind.rs index 54d5c81d..55f4b3ad 100644 --- a/src/syntax/kind.rs +++ b/src/syntax/kind.rs @@ -7,7 +7,7 @@ use crate::util::EcoString; /// All syntactical building blocks that can be part of a Typst document. /// -/// Can be created by the tokenizer or by the parser. +/// Can be created by the lexer or by the parser. #[derive(Debug, Clone, PartialEq)] pub enum SyntaxKind { /// A line comment: `// ...`. diff --git a/src/syntax/lexer.rs b/src/syntax/lexer.rs new file mode 100644 index 00000000..d5476774 --- /dev/null +++ b/src/syntax/lexer.rs @@ -0,0 +1,769 @@ +use std::num::NonZeroUsize; +use std::sync::Arc; + +use unicode_xid::UnicodeXID; +use unscanny::Scanner; + +use super::resolve::{resolve_hex, resolve_raw, resolve_string}; +use super::{ErrorPos, RawFields, SyntaxKind, Unit}; +use crate::geom::{AbsUnit, AngleUnit}; +use crate::util::{format_eco, EcoString}; + +/// Splits up a string of source code into tokens. +#[derive(Clone)] +pub struct Lexer<'s> { + /// The underlying scanner. + s: Scanner<'s>, + /// The mode the lexer is in. This determines what tokens it recognizes. + mode: LexMode, + /// Whether the last token has been terminated. + terminated: bool, + /// Offsets the indentation on the first line of the source. + column_offset: usize, +} + +/// What kind of tokens to emit. +#[derive(Debug, Copy, Clone, Eq, PartialEq)] +pub enum LexMode { + /// Text and markup. + Markup, + /// Math atoms, operators, etc. + Math, + /// Keywords, literals and operators. + Code, +} + +impl<'s> Lexer<'s> { + /// Create a new lexer with the given mode. + pub fn new(text: &'s str, mode: LexMode) -> Self { + Self::with_prefix("", text, mode) + } + + /// Create a new lexer with the given mode and a prefix to offset column + /// calculations. + pub fn with_prefix(prefix: &str, text: &'s str, mode: LexMode) -> Self { + Self { + s: Scanner::new(text), + mode, + terminated: true, + column_offset: column(prefix, prefix.len(), 0), + } + } + + /// Get the current lexing mode. + pub fn mode(&self) -> LexMode { + self.mode + } + + /// Change the lexing mode. + pub fn set_mode(&mut self, mode: LexMode) { + self.mode = mode; + } + + /// The index in the string at which the last token ends and next token + /// will start. + pub fn cursor(&self) -> usize { + self.s.cursor() + } + + /// Jump to the given index in the string. + pub fn jump(&mut self, index: usize) { + self.s.jump(index); + } + + /// The underlying scanner. + pub fn scanner(&self) -> Scanner<'s> { + self.s + } + + /// Whether the last token was terminated. + pub fn terminated(&self) -> bool { + self.terminated + } + + /// The column index of a given index in the source string. + pub fn column(&self, index: usize) -> usize { + column(self.s.string(), index, self.column_offset) + } +} + +impl Iterator for Lexer<'_> { + type Item = SyntaxKind; + + /// Produce the next token. + fn next(&mut self) -> Option { + let start = self.s.cursor(); + let c = self.s.eat()?; + Some(match c { + // Trivia. + '/' if self.s.eat_if('/') => self.line_comment(), + '/' if self.s.eat_if('*') => self.block_comment(), + '*' if self.s.eat_if('/') => SyntaxKind::Error( + ErrorPos::Full, + "unexpected end of block comment".into(), + ), + c if c.is_whitespace() => self.whitespace(c), + + // Other things. + _ => match self.mode { + LexMode::Markup => self.markup(start, c), + LexMode::Math => self.math(start, c), + LexMode::Code => self.code(start, c), + }, + }) + } +} + +/// Shared. +impl Lexer<'_> { + fn line_comment(&mut self) -> SyntaxKind { + self.s.eat_until(is_newline); + if self.s.peek().is_none() { + self.terminated = false; + } + SyntaxKind::LineComment + } + + fn block_comment(&mut self) -> SyntaxKind { + let mut state = '_'; + let mut depth = 1; + self.terminated = false; + + // Find the first `*/` that does not correspond to a nested `/*`. + while let Some(c) = self.s.eat() { + state = match (state, c) { + ('*', '/') => { + depth -= 1; + if depth == 0 { + self.terminated = true; + break; + } + '_' + } + ('/', '*') => { + depth += 1; + '_' + } + ('/', '/') => { + self.line_comment(); + '_' + } + _ => c, + } + } + + SyntaxKind::BlockComment + } + + fn whitespace(&mut self, c: char) -> SyntaxKind { + if c == ' ' && !self.s.at(char::is_whitespace) { + return SyntaxKind::Space { newlines: 0 }; + } + + self.s.uneat(); + + // Count the number of newlines. + let mut newlines = 0; + while let Some(c) = self.s.eat() { + if !c.is_whitespace() { + self.s.uneat(); + break; + } + + if is_newline(c) { + if c == '\r' { + self.s.eat_if('\n'); + } + newlines += 1; + } + } + + SyntaxKind::Space { newlines } + } +} + +impl Lexer<'_> { + fn markup(&mut self, start: usize, c: char) -> SyntaxKind { + match c { + // Blocks. + '{' => SyntaxKind::LeftBrace, + '}' => SyntaxKind::RightBrace, + '[' => SyntaxKind::LeftBracket, + ']' => SyntaxKind::RightBracket, + + // Multi-char things. + '#' => self.hash(start), + '.' if self.s.eat_if("..") => SyntaxKind::Shorthand('\u{2026}'), + '-' => self.hyph(), + ':' => self.colon(), + 'h' if self.s.eat_if("ttp://") || self.s.eat_if("ttps://") => { + self.link(start) + } + '`' => self.raw(), + c if c.is_ascii_digit() => self.numbering(start), + '<' if self.s.at(is_id_continue) => self.label(), + '@' if self.s.at(is_id_continue) => self.reference(), + + // Escape sequences. + '\\' => self.backslash(), + + // Single-char things. + '~' => SyntaxKind::Shorthand('\u{00A0}'), + '\'' => SyntaxKind::SmartQuote { double: false }, + '"' => SyntaxKind::SmartQuote { double: true }, + '*' if !self.in_word() => SyntaxKind::Star, + '_' if !self.in_word() => SyntaxKind::Underscore, + '$' => SyntaxKind::Dollar, + '=' => SyntaxKind::Eq, + '+' => SyntaxKind::Plus, + '/' => SyntaxKind::Slash, + + // Plain text. + _ => self.text(start), + } + } + + fn text(&mut self, start: usize) -> SyntaxKind { + macro_rules! table { + ($(|$c:literal)*) => {{ + let mut t = [false; 128]; + $(t[$c as usize] = true;)* + t + }} + } + + const TABLE: [bool; 128] = table! { + | ' ' | '\t' | '\n' | '\x0b' | '\x0c' | '\r' | '\\' | '/' + | '[' | ']' | '{' | '}' | '~' | '-' | '.' | '\'' | '"' + | '*' | '_' | ':' | 'h' | '`' | '$' | '<' | '>' | '@' | '#' + }; + + loop { + self.s.eat_until(|c: char| { + TABLE.get(c as usize).copied().unwrap_or_else(|| c.is_whitespace()) + }); + + // Continue with the same text node if the thing would become text + // anyway. + let mut s = self.s; + match s.eat() { + Some('/') if !s.at(['/', '*']) => {} + Some(' ') if s.at(char::is_alphanumeric) => {} + Some('-') if !s.at(['-', '?']) => {} + Some('.') if !s.at("..") => {} + Some('h') if !s.at("ttp://") && !s.at("ttps://") => {} + Some('@' | '#') if !s.at(is_id_start) => {} + _ => break, + } + + self.s = s; + } + + SyntaxKind::Text(self.s.from(start).into()) + } + + fn backslash(&mut self) -> SyntaxKind { + match self.s.peek() { + Some('u') if self.s.eat_if("u{") => { + let sequence = self.s.eat_while(char::is_ascii_alphanumeric); + if self.s.eat_if('}') { + if let Some(c) = resolve_hex(sequence) { + SyntaxKind::Escape(c) + } else { + SyntaxKind::Error( + ErrorPos::Full, + "invalid unicode escape sequence".into(), + ) + } + } else { + self.terminated = false; + SyntaxKind::Error(ErrorPos::End, "expected closing brace".into()) + } + } + + // Linebreaks. + Some(c) if c.is_whitespace() => SyntaxKind::Linebreak, + None => SyntaxKind::Linebreak, + + // Escapes. + Some(c) => { + self.s.expect(c); + SyntaxKind::Escape(c) + } + } + } + + fn hash(&mut self, start: usize) -> SyntaxKind { + if self.s.eat_if('{') { + SyntaxKind::LeftBrace + } else if self.s.eat_if('[') { + SyntaxKind::LeftBracket + } else if self.s.at(is_id_start) { + let read = self.s.eat_while(is_id_continue); + match keyword(read) { + Some(keyword) => keyword, + None => SyntaxKind::Ident(read.into()), + } + } else if self.mode == LexMode::Markup { + self.text(start) + } else { + SyntaxKind::Atom("#".into()) + } + } + + fn hyph(&mut self) -> SyntaxKind { + if self.s.eat_if('-') { + if self.s.eat_if('-') { + SyntaxKind::Shorthand('\u{2014}') + } else { + SyntaxKind::Shorthand('\u{2013}') + } + } else if self.s.eat_if('?') { + SyntaxKind::Shorthand('\u{00AD}') + } else { + SyntaxKind::Minus + } + } + + fn colon(&mut self) -> SyntaxKind { + let start = self.s.cursor(); + let mut end = start; + while !self.s.eat_while(char::is_ascii_alphanumeric).is_empty() && self.s.at(':') + { + end = self.s.cursor(); + self.s.eat(); + } + + self.s.jump(end); + + if start < end { + self.s.expect(':'); + SyntaxKind::Symbol(self.s.get(start..end).into()) + } else if self.mode == LexMode::Markup { + SyntaxKind::Colon + } else { + SyntaxKind::Atom(":".into()) + } + } + + fn link(&mut self, start: usize) -> SyntaxKind { + #[rustfmt::skip] + self.s.eat_while(|c: char| matches!(c, + | '0' ..= '9' + | 'a' ..= 'z' + | 'A' ..= 'Z' + | '~' | '/' | '%' | '?' | '#' | '&' | '+' | '=' + | '\'' | '.' | ',' | ';' + )); + if self.s.scout(-1) == Some('.') { + self.s.uneat(); + } + SyntaxKind::Link(self.s.from(start).into()) + } + + fn raw(&mut self) -> SyntaxKind { + let column = self.column(self.s.cursor() - 1); + + let mut backticks = 1; + while self.s.eat_if('`') { + backticks += 1; + } + + // Special case for empty inline block. + if backticks == 2 { + return SyntaxKind::Raw(Arc::new(RawFields { + text: EcoString::new(), + lang: None, + block: false, + })); + } + + let start = self.s.cursor(); + let mut found = 0; + while found < backticks { + match self.s.eat() { + Some('`') => found += 1, + Some(_) => found = 0, + None => break, + } + } + + if found == backticks { + let end = self.s.cursor() - found as usize; + SyntaxKind::Raw(Arc::new(resolve_raw( + column, + backticks, + self.s.get(start..end), + ))) + } else { + self.terminated = false; + let remaining = backticks - found; + let noun = if remaining == 1 { "backtick" } else { "backticks" }; + SyntaxKind::Error( + ErrorPos::End, + if found == 0 { + format_eco!("expected {} {}", remaining, noun) + } else { + format_eco!("expected {} more {}", remaining, noun) + }, + ) + } + } + + fn numbering(&mut self, start: usize) -> SyntaxKind { + self.s.eat_while(char::is_ascii_digit); + let read = self.s.from(start); + if self.s.eat_if('.') { + if let Ok(number) = read.parse::() { + return match NonZeroUsize::new(number) { + Some(number) => SyntaxKind::EnumNumbering(number), + None => SyntaxKind::Error(ErrorPos::Full, "must be positive".into()), + }; + } + } + + self.text(start) + } + + fn reference(&mut self) -> SyntaxKind { + SyntaxKind::Ref(self.s.eat_while(is_id_continue).into()) + } + + fn in_word(&self) -> bool { + let alphanumeric = |c: Option| c.map_or(false, |c| c.is_alphanumeric()); + let prev = self.s.scout(-2); + let next = self.s.peek(); + alphanumeric(prev) && alphanumeric(next) + } +} + +/// Math. +impl Lexer<'_> { + fn math(&mut self, start: usize, c: char) -> SyntaxKind { + match c { + // Symbol shorthands. + '|' if self.s.eat_if("->") => SyntaxKind::Shorthand('\u{21A6}'), + '<' if self.s.eat_if("->") => SyntaxKind::Shorthand('\u{2194}'), + '<' if self.s.eat_if("=>") => SyntaxKind::Shorthand('\u{21D4}'), + '!' if self.s.eat_if('=') => SyntaxKind::Shorthand('\u{2260}'), + '<' if self.s.eat_if('=') => SyntaxKind::Shorthand('\u{2264}'), + '>' if self.s.eat_if('=') => SyntaxKind::Shorthand('\u{2265}'), + '<' if self.s.eat_if('-') => SyntaxKind::Shorthand('\u{2190}'), + '-' if self.s.eat_if('>') => SyntaxKind::Shorthand('\u{2192}'), + '=' if self.s.eat_if('>') => SyntaxKind::Shorthand('\u{21D2}'), + ':' if self.s.eat_if('=') => SyntaxKind::Shorthand('\u{2254}'), + + // Multi-char things. + '#' => self.hash(start), + + // Escape sequences. + '\\' => self.backslash(), + + // Single-char things. + '_' => SyntaxKind::Underscore, + '^' => SyntaxKind::Hat, + '/' => SyntaxKind::Slash, + '&' => SyntaxKind::Amp, + '$' => SyntaxKind::Dollar, + + // Symbol notation. + ':' => self.colon(), + + // Strings. + '"' => self.string(), + + // Identifiers and symbol notation. + c if is_math_id_start(c) && self.s.at(is_math_id_continue) => { + self.s.eat_while(is_math_id_continue); + + let mut symbol = false; + while self.s.eat_if(':') + && !self.s.eat_while(char::is_alphanumeric).is_empty() + { + symbol = true; + } + + if symbol { + SyntaxKind::Symbol(self.s.from(start).into()) + } else { + if self.s.scout(-1) == Some(':') { + self.s.uneat(); + } + + SyntaxKind::Ident(self.s.from(start).into()) + } + } + + // Numbers. + c if c.is_numeric() => { + self.s.eat_while(char::is_numeric); + SyntaxKind::Atom(self.s.from(start).into()) + } + + // Other math atoms. + c => SyntaxKind::Atom(c.into()), + } + } +} + +/// Code. +impl Lexer<'_> { + fn code(&mut self, start: usize, c: char) -> SyntaxKind { + match c { + // Blocks. + '{' => SyntaxKind::LeftBrace, + '}' => SyntaxKind::RightBrace, + '[' => SyntaxKind::LeftBracket, + ']' => SyntaxKind::RightBracket, + + // Parentheses. + '(' => SyntaxKind::LeftParen, + ')' => SyntaxKind::RightParen, + + // Math. + '$' => SyntaxKind::Dollar, + + // Labels and raw. + '<' if self.s.at(is_id_continue) => self.label(), + '`' => self.raw(), + + // Two-char operators. + '=' if self.s.eat_if('=') => SyntaxKind::EqEq, + '!' if self.s.eat_if('=') => SyntaxKind::ExclEq, + '<' if self.s.eat_if('=') => SyntaxKind::LtEq, + '>' if self.s.eat_if('=') => SyntaxKind::GtEq, + '+' if self.s.eat_if('=') => SyntaxKind::PlusEq, + '-' if self.s.eat_if('=') => SyntaxKind::HyphEq, + '*' if self.s.eat_if('=') => SyntaxKind::StarEq, + '/' if self.s.eat_if('=') => SyntaxKind::SlashEq, + '.' if self.s.eat_if('.') => SyntaxKind::Dots, + '=' if self.s.eat_if('>') => SyntaxKind::Arrow, + + // Single-char operators. + ',' => SyntaxKind::Comma, + ';' => SyntaxKind::Semicolon, + ':' => SyntaxKind::Colon, + '+' => SyntaxKind::Plus, + '-' => SyntaxKind::Minus, + '*' => SyntaxKind::Star, + '/' => SyntaxKind::Slash, + '=' => SyntaxKind::Eq, + '<' => SyntaxKind::Lt, + '>' => SyntaxKind::Gt, + '.' if !self.s.at(char::is_ascii_digit) => SyntaxKind::Dot, + + // Identifiers. + c if is_id_start(c) => self.ident(start), + + // Numbers. + c if c.is_ascii_digit() || (c == '.' && self.s.at(char::is_ascii_digit)) => { + self.number(start, c) + } + + // Strings. + '"' => self.string(), + + // Invalid token. + _ => SyntaxKind::Error(ErrorPos::Full, "not valid here".into()), + } + } + + fn ident(&mut self, start: usize) -> SyntaxKind { + self.s.eat_while(is_id_continue); + match self.s.from(start) { + "none" => SyntaxKind::None, + "auto" => SyntaxKind::Auto, + "true" => SyntaxKind::Bool(true), + "false" => SyntaxKind::Bool(false), + id => keyword(id).unwrap_or_else(|| SyntaxKind::Ident(id.into())), + } + } + + fn number(&mut self, start: usize, c: char) -> SyntaxKind { + // Read the first part (integer or fractional depending on `first`). + self.s.eat_while(char::is_ascii_digit); + + // Read the fractional part if not already done. + // Make sure not to confuse a range for the decimal separator. + if c != '.' && !self.s.at("..") && self.s.eat_if('.') { + self.s.eat_while(char::is_ascii_digit); + } + + // Read the exponent. + if !self.s.at("em") && self.s.eat_if(['e', 'E']) { + self.s.eat_if(['+', '-']); + self.s.eat_while(char::is_ascii_digit); + } + + // Read the suffix. + let suffix_start = self.s.cursor(); + if !self.s.eat_if('%') { + self.s.eat_while(char::is_ascii_alphanumeric); + } + + let number = self.s.get(start..suffix_start); + let suffix = self.s.from(suffix_start); + + // Find out whether it is a simple number. + if suffix.is_empty() { + if let Ok(i) = number.parse::() { + return SyntaxKind::Int(i); + } + } + + let Ok(v) = number.parse::() else { + return SyntaxKind::Error(ErrorPos::Full, "invalid number".into()); + }; + + match suffix { + "" => SyntaxKind::Float(v), + "pt" => SyntaxKind::Numeric(v, Unit::Length(AbsUnit::Pt)), + "mm" => SyntaxKind::Numeric(v, Unit::Length(AbsUnit::Mm)), + "cm" => SyntaxKind::Numeric(v, Unit::Length(AbsUnit::Cm)), + "in" => SyntaxKind::Numeric(v, Unit::Length(AbsUnit::In)), + "deg" => SyntaxKind::Numeric(v, Unit::Angle(AngleUnit::Deg)), + "rad" => SyntaxKind::Numeric(v, Unit::Angle(AngleUnit::Rad)), + "em" => SyntaxKind::Numeric(v, Unit::Em), + "fr" => SyntaxKind::Numeric(v, Unit::Fr), + "%" => SyntaxKind::Numeric(v, Unit::Percent), + _ => SyntaxKind::Error(ErrorPos::Full, "invalid number suffix".into()), + } + } + + fn string(&mut self) -> SyntaxKind { + let mut escaped = false; + let verbatim = self.s.eat_until(|c| { + if c == '"' && !escaped { + true + } else { + escaped = c == '\\' && !escaped; + false + } + }); + + let string = resolve_string(verbatim); + if self.s.eat_if('"') { + SyntaxKind::Str(string) + } else { + self.terminated = false; + SyntaxKind::Error(ErrorPos::End, "expected quote".into()) + } + } + + fn label(&mut self) -> SyntaxKind { + let label = self.s.eat_while(is_id_continue); + if self.s.eat_if('>') { + if !label.is_empty() { + SyntaxKind::Label(label.into()) + } else { + SyntaxKind::Error(ErrorPos::Full, "label cannot be empty".into()) + } + } else { + self.terminated = false; + SyntaxKind::Error(ErrorPos::End, "expected closing angle bracket".into()) + } + } +} + +/// Try to parse an identifier into a keyword. +fn keyword(ident: &str) -> Option { + Some(match ident { + "not" => SyntaxKind::Not, + "and" => SyntaxKind::And, + "or" => SyntaxKind::Or, + "let" => SyntaxKind::Let, + "set" => SyntaxKind::Set, + "show" => SyntaxKind::Show, + "if" => SyntaxKind::If, + "else" => SyntaxKind::Else, + "for" => SyntaxKind::For, + "in" => SyntaxKind::In, + "while" => SyntaxKind::While, + "break" => SyntaxKind::Break, + "continue" => SyntaxKind::Continue, + "return" => SyntaxKind::Return, + "import" => SyntaxKind::Import, + "include" => SyntaxKind::Include, + "as" => SyntaxKind::As, + _ => return None, + }) +} + +/// The column index of a given index in the source string, given a column +/// offset for the first line. +fn column(string: &str, index: usize, offset: usize) -> usize { + let mut apply_offset = false; + let res = string[..index] + .char_indices() + .rev() + .take_while(|&(_, c)| !is_newline(c)) + .inspect(|&(i, _)| { + if i == 0 { + apply_offset = true + } + }) + .count(); + + // The loop is never executed if the slice is empty, but we are of + // course still at the start of the first line. + if index == 0 { + apply_offset = true; + } + + if apply_offset { + res + offset + } else { + res + } +} + +/// Whether this character denotes a newline. +#[inline] +pub fn is_newline(character: char) -> bool { + matches!( + character, + // Line Feed, Vertical Tab, Form Feed, Carriage Return. + '\n' | '\x0B' | '\x0C' | '\r' | + // Next Line, Line Separator, Paragraph Separator. + '\u{0085}' | '\u{2028}' | '\u{2029}' + ) +} + +/// Whether a string is a valid unicode identifier. +/// +/// In addition to what is specified in the [Unicode Standard][uax31], we allow: +/// - `_` as a starting character, +/// - `_` and `-` as continuing characters. +/// +/// [uax31]: http://www.unicode.org/reports/tr31/ +#[inline] +pub fn is_ident(string: &str) -> bool { + let mut chars = string.chars(); + chars + .next() + .map_or(false, |c| is_id_start(c) && chars.all(is_id_continue)) +} + +/// Whether a character can start an identifier. +#[inline] +fn is_id_start(c: char) -> bool { + c.is_xid_start() || c == '_' +} + +/// Whether a character can continue an identifier. +#[inline] +fn is_id_continue(c: char) -> bool { + c.is_xid_continue() || c == '_' || c == '-' +} + +/// Whether a character can start an identifier in math. +#[inline] +fn is_math_id_start(c: char) -> bool { + c.is_xid_start() +} + +/// Whether a character can continue an identifier in math. +#[inline] +fn is_math_id_continue(c: char) -> bool { + c.is_xid_continue() && c != '_' +} diff --git a/src/syntax/linked.rs b/src/syntax/linked.rs index 2826835e..e69de29b 100644 --- a/src/syntax/linked.rs +++ b/src/syntax/linked.rs @@ -1,274 +0,0 @@ -use std::fmt::{self, Debug, Formatter}; -use std::ops::{Deref, Range}; -use std::rc::Rc; - -use super::{SyntaxKind, SyntaxNode}; - -/// A syntax node in a context. -/// -/// Knows its exact offset in the file and provides access to its -/// children, parent and siblings. -/// -/// **Note that all sibling and leaf accessors skip over trivia!** -#[derive(Clone)] -pub struct LinkedNode<'a> { - node: &'a SyntaxNode, - parent: Option>, - index: usize, - offset: usize, -} - -impl<'a> LinkedNode<'a> { - /// Start a new traversal at the source's root node. - pub fn new(root: &'a SyntaxNode) -> Self { - Self { node: root, parent: None, index: 0, offset: 0 } - } - - /// Get the contained syntax node. - pub fn get(&self) -> &'a SyntaxNode { - self.node - } - - /// The index of this node in its parent's children list. - pub fn index(&self) -> usize { - self.index - } - - /// The absolute byte offset of the this node in the source file. - pub fn offset(&self) -> usize { - self.offset - } - - /// The byte range of the this node in the source file. - pub fn range(&self) -> Range { - self.offset..self.offset + self.node.len() - } - - /// Get this node's children. - pub fn children(&self) -> LinkedChildren<'a> { - LinkedChildren { - parent: Rc::new(self.clone()), - iter: self.node.children().enumerate(), - front: self.offset, - back: self.offset + self.len(), - } - } -} - -/// Access to parents and siblings. -impl<'a> LinkedNode<'a> { - /// Get this node's parent. - pub fn parent(&self) -> Option<&Self> { - self.parent.as_deref() - } - - /// Get the kind of this node's parent. - pub fn parent_kind(&self) -> Option<&'a SyntaxKind> { - Some(self.parent()?.node.kind()) - } - - /// Get the first previous non-trivia sibling node. - pub fn prev_sibling(&self) -> Option { - let parent = self.parent()?; - let index = self.index.checked_sub(1)?; - let node = parent.node.children().nth(index)?; - let offset = self.offset - node.len(); - let prev = Self { node, parent: self.parent.clone(), index, offset }; - if prev.kind().is_trivia() { - prev.prev_sibling() - } else { - Some(prev) - } - } - - /// Get the next non-trivia sibling node. - pub fn next_sibling(&self) -> Option { - let parent = self.parent()?; - let index = self.index.checked_add(1)?; - let node = parent.node.children().nth(index)?; - let offset = self.offset + self.node.len(); - let next = Self { node, parent: self.parent.clone(), index, offset }; - if next.kind().is_trivia() { - next.next_sibling() - } else { - Some(next) - } - } -} - -/// Access to leafs. -impl<'a> LinkedNode<'a> { - /// Get the rightmost non-trivia leaf before this node. - pub fn prev_leaf(&self) -> Option { - let mut node = self.clone(); - while let Some(prev) = node.prev_sibling() { - if let Some(leaf) = prev.rightmost_leaf() { - return Some(leaf); - } - node = prev; - } - self.parent()?.prev_leaf() - } - - /// Find the leftmost contained non-trivia leaf. - pub fn leftmost_leaf(&self) -> Option { - if self.is_leaf() && !self.kind().is_trivia() && !self.kind().is_error() { - return Some(self.clone()); - } - - for child in self.children() { - if let Some(leaf) = child.leftmost_leaf() { - return Some(leaf); - } - } - - None - } - - /// Get the leaf at the specified cursor position. - pub fn leaf_at(&self, cursor: usize) -> Option { - if self.node.children().len() == 0 && cursor <= self.offset + self.len() { - return Some(self.clone()); - } - - let mut offset = self.offset; - let count = self.node.children().len(); - for (i, child) in self.children().enumerate() { - let len = child.len(); - if (offset < cursor && cursor <= offset + len) - || (offset == cursor && i + 1 == count) - { - return child.leaf_at(cursor); - } - offset += len; - } - - None - } - - /// Find the rightmost contained non-trivia leaf. - pub fn rightmost_leaf(&self) -> Option { - if self.is_leaf() && !self.kind().is_trivia() { - return Some(self.clone()); - } - - for child in self.children().rev() { - if let Some(leaf) = child.rightmost_leaf() { - return Some(leaf); - } - } - - None - } - - /// Get the leftmost non-trivia leaf after this node. - pub fn next_leaf(&self) -> Option { - let mut node = self.clone(); - while let Some(next) = node.next_sibling() { - if let Some(leaf) = next.leftmost_leaf() { - return Some(leaf); - } - node = next; - } - self.parent()?.next_leaf() - } -} - -impl Deref for LinkedNode<'_> { - type Target = SyntaxNode; - - fn deref(&self) -> &Self::Target { - self.get() - } -} - -impl Debug for LinkedNode<'_> { - fn fmt(&self, f: &mut Formatter) -> fmt::Result { - self.node.fmt(f) - } -} - -/// An iterator over the children of a linked node. -pub struct LinkedChildren<'a> { - parent: Rc>, - iter: std::iter::Enumerate>, - front: usize, - back: usize, -} - -impl<'a> Iterator for LinkedChildren<'a> { - type Item = LinkedNode<'a>; - - fn next(&mut self) -> Option { - self.iter.next().map(|(index, node)| { - let offset = self.front; - self.front += node.len(); - LinkedNode { - node, - parent: Some(self.parent.clone()), - index, - offset, - } - }) - } - - fn size_hint(&self) -> (usize, Option) { - self.iter.size_hint() - } -} - -impl DoubleEndedIterator for LinkedChildren<'_> { - fn next_back(&mut self) -> Option { - self.iter.next_back().map(|(index, node)| { - self.back -= node.len(); - LinkedNode { - node, - parent: Some(self.parent.clone()), - index, - offset: self.back, - } - }) - } -} - -impl ExactSizeIterator for LinkedChildren<'_> {} - -#[cfg(test)] -mod tests { - use super::*; - use crate::syntax::Source; - - #[test] - fn test_linked_node() { - let source = Source::detached("#set text(12pt, red)"); - - // Find "text". - let node = LinkedNode::new(source.root()).leaf_at(7).unwrap(); - assert_eq!(node.offset(), 5); - assert_eq!(node.len(), 4); - assert_eq!(node.kind(), &SyntaxKind::Ident("text".into())); - - // Go back to "#set". Skips the space. - let prev = node.prev_sibling().unwrap(); - assert_eq!(prev.offset(), 0); - assert_eq!(prev.len(), 4); - assert_eq!(prev.kind(), &SyntaxKind::Set); - } - - #[test] - fn test_linked_node_non_trivia_leaf() { - let source = Source::detached("#set fun(12pt, red)"); - let leaf = LinkedNode::new(source.root()).leaf_at(6).unwrap(); - let prev = leaf.prev_leaf().unwrap(); - assert_eq!(leaf.kind(), &SyntaxKind::Ident("fun".into())); - assert_eq!(prev.kind(), &SyntaxKind::Set); - - let source = Source::detached("#let x = 10"); - let leaf = LinkedNode::new(source.root()).leaf_at(9).unwrap(); - let prev = leaf.prev_leaf().unwrap(); - let next = leaf.next_leaf().unwrap(); - assert_eq!(prev.kind(), &SyntaxKind::Eq); - assert_eq!(leaf.kind(), &SyntaxKind::Space { newlines: 0 }); - assert_eq!(next.kind(), &SyntaxKind::Int(10)); - } -} diff --git a/src/syntax/mod.rs b/src/syntax/mod.rs index 4f159b83..81524aa2 100644 --- a/src/syntax/mod.rs +++ b/src/syntax/mod.rs @@ -4,25 +4,20 @@ pub mod ast; mod incremental; mod kind; -mod linked; +mod lexer; mod node; mod parser; mod parsing; mod resolve; mod source; mod span; -mod tokens; pub use self::kind::*; -pub use self::linked::*; +pub use self::lexer::*; pub use self::node::*; pub use self::parsing::*; pub use self::source::*; pub use self::span::*; -pub use self::tokens::*; use incremental::reparse; use parser::*; - -#[cfg(test)] -mod tests; diff --git a/src/syntax/node.rs b/src/syntax/node.rs index 62c07ffb..13556ede 100644 --- a/src/syntax/node.rs +++ b/src/syntax/node.rs @@ -1,5 +1,6 @@ use std::fmt::{self, Debug, Display, Formatter}; -use std::ops::Range; +use std::ops::{Deref, Range}; +use std::rc::Rc; use std::sync::Arc; use super::ast::AstNode; @@ -526,6 +527,235 @@ impl PartialEq for NodeData { } } +/// A syntax node in a context. +/// +/// Knows its exact offset in the file and provides access to its +/// children, parent and siblings. +/// +/// **Note that all sibling and leaf accessors skip over trivia!** +#[derive(Clone)] +pub struct LinkedNode<'a> { + node: &'a SyntaxNode, + parent: Option>, + index: usize, + offset: usize, +} + +impl<'a> LinkedNode<'a> { + /// Start a new traversal at the source's root node. + pub fn new(root: &'a SyntaxNode) -> Self { + Self { node: root, parent: None, index: 0, offset: 0 } + } + + /// Get the contained syntax node. + pub fn get(&self) -> &'a SyntaxNode { + self.node + } + + /// The index of this node in its parent's children list. + pub fn index(&self) -> usize { + self.index + } + + /// The absolute byte offset of the this node in the source file. + pub fn offset(&self) -> usize { + self.offset + } + + /// The byte range of the this node in the source file. + pub fn range(&self) -> Range { + self.offset..self.offset + self.node.len() + } + + /// Get this node's children. + pub fn children(&self) -> LinkedChildren<'a> { + LinkedChildren { + parent: Rc::new(self.clone()), + iter: self.node.children().enumerate(), + front: self.offset, + back: self.offset + self.len(), + } + } +} + +/// Access to parents and siblings. +impl<'a> LinkedNode<'a> { + /// Get this node's parent. + pub fn parent(&self) -> Option<&Self> { + self.parent.as_deref() + } + + /// Get the kind of this node's parent. + pub fn parent_kind(&self) -> Option<&'a SyntaxKind> { + Some(self.parent()?.node.kind()) + } + + /// Get the first previous non-trivia sibling node. + pub fn prev_sibling(&self) -> Option { + let parent = self.parent()?; + let index = self.index.checked_sub(1)?; + let node = parent.node.children().nth(index)?; + let offset = self.offset - node.len(); + let prev = Self { node, parent: self.parent.clone(), index, offset }; + if prev.kind().is_trivia() { + prev.prev_sibling() + } else { + Some(prev) + } + } + + /// Get the next non-trivia sibling node. + pub fn next_sibling(&self) -> Option { + let parent = self.parent()?; + let index = self.index.checked_add(1)?; + let node = parent.node.children().nth(index)?; + let offset = self.offset + self.node.len(); + let next = Self { node, parent: self.parent.clone(), index, offset }; + if next.kind().is_trivia() { + next.next_sibling() + } else { + Some(next) + } + } +} + +/// Access to leafs. +impl<'a> LinkedNode<'a> { + /// Get the rightmost non-trivia leaf before this node. + pub fn prev_leaf(&self) -> Option { + let mut node = self.clone(); + while let Some(prev) = node.prev_sibling() { + if let Some(leaf) = prev.rightmost_leaf() { + return Some(leaf); + } + node = prev; + } + self.parent()?.prev_leaf() + } + + /// Find the leftmost contained non-trivia leaf. + pub fn leftmost_leaf(&self) -> Option { + if self.is_leaf() && !self.kind().is_trivia() && !self.kind().is_error() { + return Some(self.clone()); + } + + for child in self.children() { + if let Some(leaf) = child.leftmost_leaf() { + return Some(leaf); + } + } + + None + } + + /// Get the leaf at the specified cursor position. + pub fn leaf_at(&self, cursor: usize) -> Option { + if self.node.children().len() == 0 && cursor <= self.offset + self.len() { + return Some(self.clone()); + } + + let mut offset = self.offset; + let count = self.node.children().len(); + for (i, child) in self.children().enumerate() { + let len = child.len(); + if (offset < cursor && cursor <= offset + len) + || (offset == cursor && i + 1 == count) + { + return child.leaf_at(cursor); + } + offset += len; + } + + None + } + + /// Find the rightmost contained non-trivia leaf. + pub fn rightmost_leaf(&self) -> Option { + if self.is_leaf() && !self.kind().is_trivia() { + return Some(self.clone()); + } + + for child in self.children().rev() { + if let Some(leaf) = child.rightmost_leaf() { + return Some(leaf); + } + } + + None + } + + /// Get the leftmost non-trivia leaf after this node. + pub fn next_leaf(&self) -> Option { + let mut node = self.clone(); + while let Some(next) = node.next_sibling() { + if let Some(leaf) = next.leftmost_leaf() { + return Some(leaf); + } + node = next; + } + self.parent()?.next_leaf() + } +} + +impl Deref for LinkedNode<'_> { + type Target = SyntaxNode; + + fn deref(&self) -> &Self::Target { + self.get() + } +} + +impl Debug for LinkedNode<'_> { + fn fmt(&self, f: &mut Formatter) -> fmt::Result { + self.node.fmt(f) + } +} + +/// An iterator over the children of a linked node. +pub struct LinkedChildren<'a> { + parent: Rc>, + iter: std::iter::Enumerate>, + front: usize, + back: usize, +} + +impl<'a> Iterator for LinkedChildren<'a> { + type Item = LinkedNode<'a>; + + fn next(&mut self) -> Option { + self.iter.next().map(|(index, node)| { + let offset = self.front; + self.front += node.len(); + LinkedNode { + node, + parent: Some(self.parent.clone()), + index, + offset, + } + }) + } + + fn size_hint(&self) -> (usize, Option) { + self.iter.size_hint() + } +} + +impl DoubleEndedIterator for LinkedChildren<'_> { + fn next_back(&mut self) -> Option { + self.iter.next_back().map(|(index, node)| { + self.back -= node.len(); + LinkedNode { + node, + parent: Some(self.parent.clone()), + index, + offset: self.back, + } + }) + } +} + +impl ExactSizeIterator for LinkedChildren<'_> {} + /// Result of numbering a node within an interval. pub(super) type NumberingResult = Result<(), Unnumberable>; @@ -540,3 +770,43 @@ impl Display for Unnumberable { } impl std::error::Error for Unnumberable {} + +#[cfg(test)] +mod tests { + use super::*; + use crate::syntax::Source; + + #[test] + fn test_linked_node() { + let source = Source::detached("#set text(12pt, red)"); + + // Find "text". + let node = LinkedNode::new(source.root()).leaf_at(7).unwrap(); + assert_eq!(node.offset(), 5); + assert_eq!(node.len(), 4); + assert_eq!(node.kind(), &SyntaxKind::Ident("text".into())); + + // Go back to "#set". Skips the space. + let prev = node.prev_sibling().unwrap(); + assert_eq!(prev.offset(), 0); + assert_eq!(prev.len(), 4); + assert_eq!(prev.kind(), &SyntaxKind::Set); + } + + #[test] + fn test_linked_node_non_trivia_leaf() { + let source = Source::detached("#set fun(12pt, red)"); + let leaf = LinkedNode::new(source.root()).leaf_at(6).unwrap(); + let prev = leaf.prev_leaf().unwrap(); + assert_eq!(leaf.kind(), &SyntaxKind::Ident("fun".into())); + assert_eq!(prev.kind(), &SyntaxKind::Set); + + let source = Source::detached("#let x = 10"); + let leaf = LinkedNode::new(source.root()).leaf_at(9).unwrap(); + let prev = leaf.prev_leaf().unwrap(); + let next = leaf.next_leaf().unwrap(); + assert_eq!(prev.kind(), &SyntaxKind::Eq); + assert_eq!(leaf.kind(), &SyntaxKind::Space { newlines: 0 }); + assert_eq!(next.kind(), &SyntaxKind::Int(10)); + } +} diff --git a/src/syntax/parser.rs b/src/syntax/parser.rs index 74be792f..d2ef6e0e 100644 --- a/src/syntax/parser.rs +++ b/src/syntax/parser.rs @@ -2,13 +2,13 @@ use std::fmt::{self, Display, Formatter}; use std::mem; use std::ops::Range; -use super::{ErrorPos, SyntaxKind, SyntaxNode, TokenMode, Tokens}; +use super::{ErrorPos, LexMode, Lexer, SyntaxKind, SyntaxNode}; use crate::util::{format_eco, EcoString}; /// A convenient token-based parser. pub struct Parser<'s> { /// An iterator over the source tokens. - tokens: Tokens<'s>, + tokens: Lexer<'s>, /// Whether we are at the end of the file or of a group. eof: bool, /// The current token. @@ -29,15 +29,15 @@ pub struct Parser<'s> { impl<'s> Parser<'s> { /// Create a new parser for the source string. - pub fn new(text: &'s str, mode: TokenMode) -> Self { + pub fn new(text: &'s str, mode: LexMode) -> Self { Self::with_prefix("", text, mode) } /// Create a new parser for the source string that is prefixed by some text /// that does not need to be parsed but taken into account for column /// calculation. - pub fn with_prefix(prefix: &str, text: &'s str, mode: TokenMode) -> Self { - let mut tokens = Tokens::with_prefix(prefix, text, mode); + pub fn with_prefix(prefix: &str, text: &'s str, mode: LexMode) -> Self { + let mut tokens = Lexer::with_prefix(prefix, text, mode); let current = tokens.next(); Self { tokens, @@ -91,7 +91,7 @@ impl<'s> Parser<'s> { let until = self.trivia_start(); let mut children = mem::replace(&mut self.children, prev); - if self.tokens.mode() == TokenMode::Markup { + if self.tokens.mode() == LexMode::Markup { self.children.push(SyntaxNode::inner(kind, children)); } else { // Trailing trivia should not be wrapped into the new node. @@ -121,7 +121,7 @@ impl<'s> Parser<'s> { self.prev_end = self.tokens.cursor(); self.bump(); - if self.tokens.mode() != TokenMode::Markup { + if self.tokens.mode() != LexMode::Markup { // Skip whitespace and comments. while self.current.as_ref().map_or(false, |x| self.is_trivia(x)) { self.bump(); @@ -235,9 +235,9 @@ impl<'s> Parser<'s> { pub fn start_group(&mut self, kind: Group) { self.groups.push(GroupEntry { kind, prev_mode: self.tokens.mode() }); self.tokens.set_mode(match kind { - Group::Bracket | Group::Strong | Group::Emph => TokenMode::Markup, - Group::Math | Group::MathRow(_, _) => TokenMode::Math, - Group::Brace | Group::Paren | Group::Expr => TokenMode::Code, + Group::Bracket | Group::Strong | Group::Emph => LexMode::Markup, + Group::Math | Group::MathRow(_, _) => LexMode::Math, + Group::Brace | Group::Paren | Group::Expr => LexMode::Code, }); match kind { @@ -296,7 +296,7 @@ impl<'s> Parser<'s> { // Rescan the peeked token if the mode changed. if rescan { let mut target = self.prev_end(); - if group_mode != TokenMode::Markup { + if group_mode != LexMode::Markup { let start = self.trivia_start().0; target = self.current_start - self.children[start..].iter().map(SyntaxNode::len).sum::(); @@ -488,7 +488,7 @@ impl Marker { } // Don't expose trivia in code. - if p.tokens.mode() != TokenMode::Markup && child.kind().is_trivia() { + if p.tokens.mode() != LexMode::Markup && child.kind().is_trivia() { continue; } @@ -515,7 +515,7 @@ struct GroupEntry { pub kind: Group, /// The mode the parser was in _before_ the group started (to which we go /// back once the group ends). - pub prev_mode: TokenMode, + pub prev_mode: LexMode, } /// A group, confined by optional start and end delimiters. diff --git a/src/syntax/parsing.rs b/src/syntax/parsing.rs index 7f557fac..a6e6c861 100644 --- a/src/syntax/parsing.rs +++ b/src/syntax/parsing.rs @@ -2,21 +2,21 @@ use std::collections::HashSet; use super::ast::{Assoc, BinOp, UnOp}; use super::{ - ErrorPos, Group, Marker, ParseError, ParseResult, Parser, SyntaxKind, SyntaxNode, - TokenMode, + ErrorPos, Group, LexMode, Marker, ParseError, ParseResult, Parser, SyntaxKind, + SyntaxNode, }; use crate::util::EcoString; /// Parse a source file. pub fn parse(text: &str) -> SyntaxNode { - let mut p = Parser::new(text, TokenMode::Markup); + let mut p = Parser::new(text, LexMode::Markup); markup(&mut p, true); p.finish().into_iter().next().unwrap() } /// Parse code directly, only used for syntax highlighting. pub fn parse_code(text: &str) -> SyntaxNode { - let mut p = Parser::new(text, TokenMode::Code); + let mut p = Parser::new(text, LexMode::Code); p.perform(SyntaxKind::CodeBlock, code); p.finish().into_iter().next().unwrap() } @@ -29,7 +29,7 @@ pub(crate) fn reparse_code_block( text: &str, end_pos: usize, ) -> Option<(Vec, bool, usize)> { - let mut p = Parser::with_prefix(prefix, text, TokenMode::Code); + let mut p = Parser::with_prefix(prefix, text, LexMode::Code); if !p.at(SyntaxKind::LeftBrace) { return None; } @@ -53,7 +53,7 @@ pub(crate) fn reparse_content_block( text: &str, end_pos: usize, ) -> Option<(Vec, bool, usize)> { - let mut p = Parser::with_prefix(prefix, text, TokenMode::Code); + let mut p = Parser::with_prefix(prefix, text, LexMode::Code); if !p.at(SyntaxKind::LeftBracket) { return None; } @@ -81,7 +81,7 @@ pub(crate) fn reparse_markup_elements( mut at_start: bool, min_indent: usize, ) -> Option<(Vec, bool, usize)> { - let mut p = Parser::with_prefix(prefix, text, TokenMode::Markup); + let mut p = Parser::with_prefix(prefix, text, LexMode::Markup); let mut node: Option<&SyntaxNode> = None; let mut iter = reference.iter(); diff --git a/src/syntax/tests.rs b/src/syntax/tests.rs deleted file mode 100644 index 7b5dd870..00000000 --- a/src/syntax/tests.rs +++ /dev/null @@ -1,483 +0,0 @@ -#![allow(non_snake_case)] - -use std::num::NonZeroUsize; -use std::sync::Arc; - -use super::*; -use crate::geom::{AbsUnit, AngleUnit}; - -use ErrorPos::*; -use Option::None; -use SyntaxKind::*; -use TokenMode::{Code, Markup}; - -use std::fmt::Debug; - -#[track_caller] -pub fn check(text: &str, found: T, expected: T) -where - T: Debug + PartialEq, -{ - if found != expected { - println!("source: {text:?}"); - println!("expected: {expected:#?}"); - println!("found: {found:#?}"); - panic!("test failed"); - } -} - -fn Space(newlines: usize) -> SyntaxKind { - SyntaxKind::Space { newlines } -} - -fn Raw(text: &str, lang: Option<&str>, block: bool) -> SyntaxKind { - SyntaxKind::Raw(Arc::new(RawFields { - text: text.into(), - lang: lang.map(Into::into), - block, - })) -} - -fn Str(string: &str) -> SyntaxKind { - SyntaxKind::Str(string.into()) -} - -fn Text(string: &str) -> SyntaxKind { - SyntaxKind::Text(string.into()) -} - -fn Ident(ident: &str) -> SyntaxKind { - SyntaxKind::Ident(ident.into()) -} - -fn Error(pos: ErrorPos, message: &str) -> SyntaxKind { - SyntaxKind::Error(pos, message.into()) -} - -/// Building blocks for suffix testing. -/// -/// We extend each test case with a collection of different suffixes to make -/// sure tokens end at the correct position. These suffixes are split into -/// blocks, which can be disabled/enabled per test case. For example, when -/// testing identifiers we disable letter suffixes because these would -/// mingle with the identifiers. -/// -/// Suffix blocks: -/// - ' ': spacing -/// - 'a': letters -/// - '1': numbers -/// - '/': symbols -const BLOCKS: &str = " a1/"; - -// Suffixes described by four-tuples of: -// -// - block the suffix is part of -// - mode in which the suffix is applicable -// - the suffix string -// - the resulting suffix NodeKind -fn suffixes() -> impl Iterator, &'static str, SyntaxKind)> -{ - [ - // Whitespace suffixes. - (' ', None, " ", Space(0)), - (' ', None, "\n", Space(1)), - (' ', None, "\r", Space(1)), - (' ', None, "\r\n", Space(1)), - // Letter suffixes. - ('a', Some(Markup), "hello", Text("hello")), - ('a', Some(Markup), "💚", Text("💚")), - ('a', Some(Code), "val", Ident("val")), - ('a', Some(Code), "α", Ident("α")), - ('a', Some(Code), "_", Ident("_")), - // Number suffixes. - ('1', Some(Code), "2", Int(2)), - ('1', Some(Code), ".2", Float(0.2)), - // Symbol suffixes. - ('/', None, "[", LeftBracket), - ('/', None, "//", LineComment), - ('/', None, "/**/", BlockComment), - ('/', Some(Markup), "*", Star), - ('/', Some(Markup), r"\\", Escape('\\')), - ('/', Some(Markup), "#let", Let), - ('/', Some(Code), "(", LeftParen), - ('/', Some(Code), ":", Colon), - ('/', Some(Code), "+=", PlusEq), - ] - .into_iter() -} - -macro_rules! t { - (Both $($tts:tt)*) => { - t!(Markup $($tts)*); - t!(Code $($tts)*); - }; - ($mode:ident $([$blocks:literal])?: $text:expr => $($token:expr),*) => {{ - // Test without suffix. - t!(@$mode: $text => $($token),*); - - // Test with each applicable suffix. - for (block, mode, suffix, ref token) in suffixes() { - let text = $text; - #[allow(unused_variables)] - let blocks = BLOCKS; - $(let blocks = $blocks;)? - assert!(!blocks.contains(|c| !BLOCKS.contains(c))); - if (mode.is_none() || mode == Some($mode)) && blocks.contains(block) { - t!(@$mode: format!("{}{}", text, suffix) => $($token,)* token); - } - } - }}; - (@$mode:ident: $text:expr => $($token:expr),*) => {{ - let text = $text; - let found = Tokens::new(&text, $mode).collect::>(); - let expected = vec![$($token.clone()),*]; - check(&text, found, expected); - }}; -} - -#[test] -fn test_tokenize_brackets() { - // Test in markup. - t!(Markup: "{" => LeftBrace); - t!(Markup: "}" => RightBrace); - t!(Markup: "[" => LeftBracket); - t!(Markup: "]" => RightBracket); - t!(Markup[" /"]: "(" => Text("(")); - t!(Markup[" /"]: ")" => Text(")")); - - // Test in code. - t!(Code: "{" => LeftBrace); - t!(Code: "}" => RightBrace); - t!(Code: "[" => LeftBracket); - t!(Code: "]" => RightBracket); - t!(Code: "(" => LeftParen); - t!(Code: ")" => RightParen); -} - -#[test] -fn test_tokenize_whitespace() { - // Test basic whitespace. - t!(Both["a1/"]: "" => ); - t!(Both["a1/"]: " " => Space(0)); - t!(Both["a1/"]: " " => Space(0)); - t!(Both["a1/"]: "\t" => Space(0)); - t!(Both["a1/"]: " \t" => Space(0)); - t!(Both["a1/"]: "\u{202F}" => Space(0)); - - // Test newline counting. - t!(Both["a1/"]: "\n" => Space(1)); - t!(Both["a1/"]: "\n " => Space(1)); - t!(Both["a1/"]: " \n" => Space(1)); - t!(Both["a1/"]: " \n " => Space(1)); - t!(Both["a1/"]: "\r\n" => Space(1)); - t!(Both["a1/"]: "\r\n\r" => Space(2)); - t!(Both["a1/"]: " \n\t \n " => Space(2)); - t!(Both["a1/"]: "\n\r" => Space(2)); - t!(Both["a1/"]: " \r\r\n \x0D" => Space(3)); -} - -#[test] -fn test_tokenize_text() { - // Test basic text. - t!(Markup[" /"]: "hello" => Text("hello")); - t!(Markup[" /"]: "reha-world" => Text("reha-world")); - - // Test code symbols in text. - t!(Markup[" /"]: "a():\"b" => Text("a()"), Colon, SmartQuote { double: true }, Text("b")); - t!(Markup[" /"]: ";,|/+" => Text(";,|/+")); - t!(Markup[" /"]: "=-a" => Eq, Minus, Text("a")); - t!(Markup[" "]: "#123" => Text("#123")); - - // Test text ends. - t!(Markup[""]: "hello " => Text("hello"), Space(0)); - t!(Markup[""]: "hello~" => Text("hello"), Shorthand('\u{00A0}')); -} - -#[test] -fn test_tokenize_escape_sequences() { - // Test escapable symbols. - t!(Markup: r"\\" => Escape('\\')); - t!(Markup: r"\/" => Escape('/')); - t!(Markup: r"\[" => Escape('[')); - t!(Markup: r"\]" => Escape(']')); - t!(Markup: r"\{" => Escape('{')); - t!(Markup: r"\}" => Escape('}')); - t!(Markup: r"\*" => Escape('*')); - t!(Markup: r"\_" => Escape('_')); - t!(Markup: r"\=" => Escape('=')); - t!(Markup: r"\~" => Escape('~')); - t!(Markup: r"\'" => Escape('\'')); - t!(Markup: r#"\""# => Escape('"')); - t!(Markup: r"\`" => Escape('`')); - t!(Markup: r"\$" => Escape('$')); - t!(Markup: r"\#" => Escape('#')); - t!(Markup: r"\a" => Escape('a')); - t!(Markup: r"\u" => Escape('u')); - t!(Markup: r"\1" => Escape('1')); - - // Test basic unicode escapes. - t!(Markup: r"\u{}" => Error(Full, "invalid unicode escape sequence")); - t!(Markup: r"\u{2603}" => Escape('☃')); - t!(Markup: r"\u{P}" => Error(Full, "invalid unicode escape sequence")); - - // Test unclosed unicode escapes. - t!(Markup[" /"]: r"\u{" => Error(End, "expected closing brace")); - t!(Markup[" /"]: r"\u{1" => Error(End, "expected closing brace")); - t!(Markup[" /"]: r"\u{26A4" => Error(End, "expected closing brace")); - t!(Markup[" /"]: r"\u{1Q3P" => Error(End, "expected closing brace")); - t!(Markup: r"\u{1🏕}" => Error(End, "expected closing brace"), Text("🏕"), RightBrace); -} - -#[test] -fn test_tokenize_markup_symbols() { - // Test markup tokens. - t!(Markup[" a1"]: "*" => Star); - t!(Markup: "_" => Underscore); - t!(Markup[""]: "===" => Eq, Eq, Eq); - t!(Markup["a1/"]: "= " => Eq, Space(0)); - t!(Markup[" "]: r"\" => Linebreak); - t!(Markup: "~" => Shorthand('\u{00A0}')); - t!(Markup["a1/"]: "-?" => Shorthand('\u{00AD}')); - t!(Markup["a "]: r"a--" => Text("a"), Shorthand('\u{2013}')); - t!(Markup["a1/"]: "- " => Minus, Space(0)); - t!(Markup[" "]: "+" => Plus); - t!(Markup[" "]: "1." => EnumNumbering(NonZeroUsize::new(1).unwrap())); - t!(Markup[" "]: "1.a" => EnumNumbering(NonZeroUsize::new(1).unwrap()), Text("a")); - t!(Markup[" /"]: "a1." => Text("a1.")); -} - -#[test] -fn test_tokenize_code_symbols() { - // Test all symbols. - t!(Code: "," => Comma); - t!(Code: ";" => Semicolon); - t!(Code: ":" => Colon); - t!(Code: "+" => Plus); - t!(Code: "-" => Minus); - t!(Code[" a1"]: "*" => Star); - t!(Code[" a1"]: "/" => Slash); - t!(Code[" a/"]: "." => Dot); - t!(Code: "=" => Eq); - t!(Code: "==" => EqEq); - t!(Code: "!=" => ExclEq); - t!(Code[" /"]: "<" => Lt); - t!(Code: "<=" => LtEq); - t!(Code: ">" => Gt); - t!(Code: ">=" => GtEq); - t!(Code: "+=" => PlusEq); - t!(Code: "-=" => HyphEq); - t!(Code: "*=" => StarEq); - t!(Code: "/=" => SlashEq); - t!(Code: ".." => Dots); - t!(Code: "=>" => Arrow); - - // Test combinations. - t!(Code: "<=>" => LtEq, Gt); - t!(Code[" a/"]: "..." => Dots, Dot); - - // Test hyphen as symbol vs part of identifier. - t!(Code[" /"]: "-1" => Minus, Int(1)); - t!(Code[" /"]: "-a" => Minus, Ident("a")); - t!(Code[" /"]: "--1" => Minus, Minus, Int(1)); - t!(Code[" /"]: "--_a" => Minus, Minus, Ident("_a")); - t!(Code[" /"]: "a-b" => Ident("a-b")); - - // Test invalid. - t!(Code: r"\" => Error(Full, "not valid here")); -} - -#[test] -fn test_tokenize_keywords() { - // A list of a few (not all) keywords. - let list = [ - ("not", Not), - ("let", Let), - ("if", If), - ("else", Else), - ("for", For), - ("in", In), - ("import", Import), - ]; - - for (s, t) in list.clone() { - t!(Markup[" "]: format!("#{}", s) => t); - t!(Markup[" "]: format!("#{0}#{0}", s) => t, t); - t!(Markup[" /"]: format!("# {}", s) => Text(&format!("# {s}"))); - } - - for (s, t) in list { - t!(Code[" "]: s => t); - t!(Markup[" /"]: s => Text(s)); - } - - // Test simple identifier. - t!(Markup[" "]: "#letter" => Ident("letter")); - t!(Code[" /"]: "falser" => Ident("falser")); - t!(Code[" /"]: "None" => Ident("None")); - t!(Code[" /"]: "True" => Ident("True")); -} - -#[test] -fn test_tokenize_raw_blocks() { - // Test basic raw block. - t!(Markup: "``" => Raw("", None, false)); - t!(Markup: "`raw`" => Raw("raw", None, false)); - t!(Markup[""]: "`]" => Error(End, "expected 1 backtick")); - - // Test special symbols in raw block. - t!(Markup: "`[brackets]`" => Raw("[brackets]", None, false)); - t!(Markup[""]: r"`\`` " => Raw(r"\", None, false), Error(End, "expected 1 backtick")); - - // Test separated closing backticks. - t!(Markup: "```not `y`e`t```" => Raw("`y`e`t", Some("not"), false)); - - // Test more backticks. - t!(Markup: "``nope``" => Raw("", None, false), Text("nope"), Raw("", None, false)); - t!(Markup: "````🚀````" => Raw("", None, false)); - t!(Markup[""]: "`````👩‍🚀````noend" => Error(End, "expected 5 backticks")); - t!(Markup[""]: "````raw``````" => Raw("", Some("raw"), false), Raw("", None, false)); -} - -#[test] -fn test_tokenize_idents() { - // Test valid identifiers. - t!(Code[" /"]: "x" => Ident("x")); - t!(Code[" /"]: "value" => Ident("value")); - t!(Code[" /"]: "__main__" => Ident("__main__")); - t!(Code[" /"]: "_snake_case" => Ident("_snake_case")); - - // Test non-ascii. - t!(Code[" /"]: "α" => Ident("α")); - t!(Code[" /"]: "ម្តាយ" => Ident("ម្តាយ")); - - // Test hyphen parsed as identifier. - t!(Code[" /"]: "kebab-case" => Ident("kebab-case")); - t!(Code[" /"]: "one-10" => Ident("one-10")); -} - -#[test] -fn test_tokenize_numeric() { - let ints = [("7", 7), ("012", 12)]; - let floats = [ - (".3", 0.3), - ("0.3", 0.3), - ("3.", 3.0), - ("3.0", 3.0), - ("14.3", 14.3), - ("10e2", 1000.0), - ("10e+0", 10.0), - ("10e+1", 100.0), - ("10e-2", 0.1), - ("10.e1", 100.0), - ("10.e-1", 1.0), - (".1e1", 1.0), - ("10E2", 1000.0), - ]; - - // Test integers. - for &(s, v) in &ints { - t!(Code[" /"]: s => Int(v)); - } - - // Test floats. - for &(s, v) in &floats { - t!(Code[" /"]: s => Float(v)); - } - - // Test attached numbers. - t!(Code[" /"]: ".2.3" => Float(0.2), Float(0.3)); - t!(Code[" /"]: "1.2.3" => Float(1.2), Float(0.3)); - t!(Code[" /"]: "1e-2+3" => Float(0.01), Plus, Int(3)); - - // Test float from too large integer. - let large = i64::MAX as f64 + 1.0; - t!(Code[" /"]: large.to_string() => Float(large)); - - // Combined integers and floats. - let nums = ints.iter().map(|&(k, v)| (k, v as f64)).chain(floats); - - let suffixes: &[(&str, fn(f64) -> SyntaxKind)] = &[ - ("mm", |x| Numeric(x, Unit::Length(AbsUnit::Mm))), - ("pt", |x| Numeric(x, Unit::Length(AbsUnit::Pt))), - ("cm", |x| Numeric(x, Unit::Length(AbsUnit::Cm))), - ("in", |x| Numeric(x, Unit::Length(AbsUnit::In))), - ("rad", |x| Numeric(x, Unit::Angle(AngleUnit::Rad))), - ("deg", |x| Numeric(x, Unit::Angle(AngleUnit::Deg))), - ("em", |x| Numeric(x, Unit::Em)), - ("fr", |x| Numeric(x, Unit::Fr)), - ("%", |x| Numeric(x, Unit::Percent)), - ]; - - // Numeric types. - for &(suffix, build) in suffixes { - for (s, v) in nums.clone() { - t!(Code[" /"]: format!("{}{}", s, suffix) => build(v)); - } - } - - // Multiple dots close the number. - t!(Code[" /"]: "1..2" => Int(1), Dots, Int(2)); - t!(Code[" /"]: "1..2.3" => Int(1), Dots, Float(2.3)); - t!(Code[" /"]: "1.2..3" => Float(1.2), Dots, Int(3)); - - // Test invalid. - t!(Code[" /"]: "1foo" => Error(Full, "invalid number suffix")); -} - -#[test] -fn test_tokenize_strings() { - // Test basic strings. - t!(Code: "\"hi\"" => Str("hi")); - t!(Code: "\"hi\nthere\"" => Str("hi\nthere")); - t!(Code: "\"🌎\"" => Str("🌎")); - - // Test unterminated. - t!(Code[""]: "\"hi" => Error(End, "expected quote")); - - // Test escaped quote. - t!(Code: r#""a\"bc""# => Str("a\"bc")); - t!(Code[""]: r#""\""# => Error(End, "expected quote")); -} - -#[test] -fn test_tokenize_line_comments() { - // Test line comment with no trailing newline. - t!(Both[""]: "//" => LineComment); - - // Test line comment ends at newline. - t!(Both["a1/"]: "//bc\n" => LineComment, Space(1)); - t!(Both["a1/"]: "// bc \n" => LineComment, Space(1)); - t!(Both["a1/"]: "//bc\r\n" => LineComment, Space(1)); - - // Test nested line comments. - t!(Both["a1/"]: "//a//b\n" => LineComment, Space(1)); -} - -#[test] -fn test_tokenize_block_comments() { - // Test basic block comments. - t!(Both[""]: "/*" => BlockComment); - t!(Both: "/**/" => BlockComment); - t!(Both: "/*🏞*/" => BlockComment); - t!(Both: "/*\n*/" => BlockComment); - - // Test depth 1 and 2 nested block comments. - t!(Both: "/* /* */ */" => BlockComment); - t!(Both: "/*/*/**/*/*/" => BlockComment); - - // Test two nested, one unclosed block comments. - t!(Both[""]: "/*/*/**/*/" => BlockComment); - - // Test all combinations of up to two following slashes and stars. - t!(Both[""]: "/*" => BlockComment); - t!(Both[""]: "/*/" => BlockComment); - t!(Both[""]: "/**" => BlockComment); - t!(Both[""]: "/*//" => BlockComment); - t!(Both[""]: "/*/*" => BlockComment); - t!(Both[""]: "/**/" => BlockComment); - t!(Both[""]: "/***" => BlockComment); - - // Test unexpected terminator. - t!(Both: "/*Hi*/*/" => BlockComment, - Error(Full, "unexpected end of block comment")); -} diff --git a/src/syntax/tokens.rs b/src/syntax/tokens.rs deleted file mode 100644 index 02bbd3a4..00000000 --- a/src/syntax/tokens.rs +++ /dev/null @@ -1,769 +0,0 @@ -use std::num::NonZeroUsize; -use std::sync::Arc; - -use unicode_xid::UnicodeXID; -use unscanny::Scanner; - -use super::resolve::{resolve_hex, resolve_raw, resolve_string}; -use super::{ErrorPos, RawFields, SyntaxKind, Unit}; -use crate::geom::{AbsUnit, AngleUnit}; -use crate::util::{format_eco, EcoString}; - -/// An iterator over the tokens of a string of source code. -#[derive(Clone)] -pub struct Tokens<'s> { - /// The underlying scanner. - s: Scanner<'s>, - /// The mode the scanner is in. This determines what tokens it recognizes. - mode: TokenMode, - /// Whether the last token has been terminated. - terminated: bool, - /// Offsets the indentation on the first line of the source. - column_offset: usize, -} - -/// What kind of tokens to emit. -#[derive(Debug, Copy, Clone, Eq, PartialEq)] -pub enum TokenMode { - /// Text and markup. - Markup, - /// Math atoms, operators, etc. - Math, - /// Keywords, literals and operators. - Code, -} - -impl<'s> Tokens<'s> { - /// Create a new token iterator with the given mode. - pub fn new(text: &'s str, mode: TokenMode) -> Self { - Self::with_prefix("", text, mode) - } - - /// Create a new token iterator with the given mode and a prefix to offset - /// column calculations. - pub fn with_prefix(prefix: &str, text: &'s str, mode: TokenMode) -> Self { - Self { - s: Scanner::new(text), - mode, - terminated: true, - column_offset: column(prefix, prefix.len(), 0), - } - } - - /// Get the current token mode. - pub fn mode(&self) -> TokenMode { - self.mode - } - - /// Change the token mode. - pub fn set_mode(&mut self, mode: TokenMode) { - self.mode = mode; - } - - /// The index in the string at which the last token ends and next token - /// will start. - pub fn cursor(&self) -> usize { - self.s.cursor() - } - - /// Jump to the given index in the string. - pub fn jump(&mut self, index: usize) { - self.s.jump(index); - } - - /// The underlying scanner. - pub fn scanner(&self) -> Scanner<'s> { - self.s - } - - /// Whether the last token was terminated. - pub fn terminated(&self) -> bool { - self.terminated - } - - /// The column index of a given index in the source string. - pub fn column(&self, index: usize) -> usize { - column(self.s.string(), index, self.column_offset) - } -} - -impl Iterator for Tokens<'_> { - type Item = SyntaxKind; - - /// Parse the next token in the source code. - fn next(&mut self) -> Option { - let start = self.s.cursor(); - let c = self.s.eat()?; - Some(match c { - // Trivia. - '/' if self.s.eat_if('/') => self.line_comment(), - '/' if self.s.eat_if('*') => self.block_comment(), - '*' if self.s.eat_if('/') => SyntaxKind::Error( - ErrorPos::Full, - "unexpected end of block comment".into(), - ), - c if c.is_whitespace() => self.whitespace(c), - - // Other things. - _ => match self.mode { - TokenMode::Markup => self.markup(start, c), - TokenMode::Math => self.math(start, c), - TokenMode::Code => self.code(start, c), - }, - }) - } -} - -/// Shared. -impl Tokens<'_> { - fn line_comment(&mut self) -> SyntaxKind { - self.s.eat_until(is_newline); - if self.s.peek().is_none() { - self.terminated = false; - } - SyntaxKind::LineComment - } - - fn block_comment(&mut self) -> SyntaxKind { - let mut state = '_'; - let mut depth = 1; - self.terminated = false; - - // Find the first `*/` that does not correspond to a nested `/*`. - while let Some(c) = self.s.eat() { - state = match (state, c) { - ('*', '/') => { - depth -= 1; - if depth == 0 { - self.terminated = true; - break; - } - '_' - } - ('/', '*') => { - depth += 1; - '_' - } - ('/', '/') => { - self.line_comment(); - '_' - } - _ => c, - } - } - - SyntaxKind::BlockComment - } - - fn whitespace(&mut self, c: char) -> SyntaxKind { - if c == ' ' && !self.s.at(char::is_whitespace) { - return SyntaxKind::Space { newlines: 0 }; - } - - self.s.uneat(); - - // Count the number of newlines. - let mut newlines = 0; - while let Some(c) = self.s.eat() { - if !c.is_whitespace() { - self.s.uneat(); - break; - } - - if is_newline(c) { - if c == '\r' { - self.s.eat_if('\n'); - } - newlines += 1; - } - } - - SyntaxKind::Space { newlines } - } -} - -impl Tokens<'_> { - fn markup(&mut self, start: usize, c: char) -> SyntaxKind { - match c { - // Blocks. - '{' => SyntaxKind::LeftBrace, - '}' => SyntaxKind::RightBrace, - '[' => SyntaxKind::LeftBracket, - ']' => SyntaxKind::RightBracket, - - // Multi-char things. - '#' => self.hash(start), - '.' if self.s.eat_if("..") => SyntaxKind::Shorthand('\u{2026}'), - '-' => self.hyph(), - ':' => self.colon(), - 'h' if self.s.eat_if("ttp://") || self.s.eat_if("ttps://") => { - self.link(start) - } - '`' => self.raw(), - c if c.is_ascii_digit() => self.numbering(start), - '<' if self.s.at(is_id_continue) => self.label(), - '@' if self.s.at(is_id_continue) => self.reference(), - - // Escape sequences. - '\\' => self.backslash(), - - // Single-char things. - '~' => SyntaxKind::Shorthand('\u{00A0}'), - '\'' => SyntaxKind::SmartQuote { double: false }, - '"' => SyntaxKind::SmartQuote { double: true }, - '*' if !self.in_word() => SyntaxKind::Star, - '_' if !self.in_word() => SyntaxKind::Underscore, - '$' => SyntaxKind::Dollar, - '=' => SyntaxKind::Eq, - '+' => SyntaxKind::Plus, - '/' => SyntaxKind::Slash, - - // Plain text. - _ => self.text(start), - } - } - - fn text(&mut self, start: usize) -> SyntaxKind { - macro_rules! table { - ($(|$c:literal)*) => {{ - let mut t = [false; 128]; - $(t[$c as usize] = true;)* - t - }} - } - - const TABLE: [bool; 128] = table! { - | ' ' | '\t' | '\n' | '\x0b' | '\x0c' | '\r' | '\\' | '/' - | '[' | ']' | '{' | '}' | '~' | '-' | '.' | '\'' | '"' - | '*' | '_' | ':' | 'h' | '`' | '$' | '<' | '>' | '@' | '#' - }; - - loop { - self.s.eat_until(|c: char| { - TABLE.get(c as usize).copied().unwrap_or_else(|| c.is_whitespace()) - }); - - // Continue with the same text node if the thing would become text - // anyway. - let mut s = self.s; - match s.eat() { - Some('/') if !s.at(['/', '*']) => {} - Some(' ') if s.at(char::is_alphanumeric) => {} - Some('-') if !s.at(['-', '?']) => {} - Some('.') if !s.at("..") => {} - Some('h') if !s.at("ttp://") && !s.at("ttps://") => {} - Some('@' | '#') if !s.at(is_id_start) => {} - _ => break, - } - - self.s = s; - } - - SyntaxKind::Text(self.s.from(start).into()) - } - - fn backslash(&mut self) -> SyntaxKind { - match self.s.peek() { - Some('u') if self.s.eat_if("u{") => { - let sequence = self.s.eat_while(char::is_ascii_alphanumeric); - if self.s.eat_if('}') { - if let Some(c) = resolve_hex(sequence) { - SyntaxKind::Escape(c) - } else { - SyntaxKind::Error( - ErrorPos::Full, - "invalid unicode escape sequence".into(), - ) - } - } else { - self.terminated = false; - SyntaxKind::Error(ErrorPos::End, "expected closing brace".into()) - } - } - - // Linebreaks. - Some(c) if c.is_whitespace() => SyntaxKind::Linebreak, - None => SyntaxKind::Linebreak, - - // Escapes. - Some(c) => { - self.s.expect(c); - SyntaxKind::Escape(c) - } - } - } - - fn hash(&mut self, start: usize) -> SyntaxKind { - if self.s.eat_if('{') { - SyntaxKind::LeftBrace - } else if self.s.eat_if('[') { - SyntaxKind::LeftBracket - } else if self.s.at(is_id_start) { - let read = self.s.eat_while(is_id_continue); - match keyword(read) { - Some(keyword) => keyword, - None => SyntaxKind::Ident(read.into()), - } - } else if self.mode == TokenMode::Markup { - self.text(start) - } else { - SyntaxKind::Atom("#".into()) - } - } - - fn hyph(&mut self) -> SyntaxKind { - if self.s.eat_if('-') { - if self.s.eat_if('-') { - SyntaxKind::Shorthand('\u{2014}') - } else { - SyntaxKind::Shorthand('\u{2013}') - } - } else if self.s.eat_if('?') { - SyntaxKind::Shorthand('\u{00AD}') - } else { - SyntaxKind::Minus - } - } - - fn colon(&mut self) -> SyntaxKind { - let start = self.s.cursor(); - let mut end = start; - while !self.s.eat_while(char::is_ascii_alphanumeric).is_empty() && self.s.at(':') - { - end = self.s.cursor(); - self.s.eat(); - } - - self.s.jump(end); - - if start < end { - self.s.expect(':'); - SyntaxKind::Symbol(self.s.get(start..end).into()) - } else if self.mode == TokenMode::Markup { - SyntaxKind::Colon - } else { - SyntaxKind::Atom(":".into()) - } - } - - fn link(&mut self, start: usize) -> SyntaxKind { - #[rustfmt::skip] - self.s.eat_while(|c: char| matches!(c, - | '0' ..= '9' - | 'a' ..= 'z' - | 'A' ..= 'Z' - | '~' | '/' | '%' | '?' | '#' | '&' | '+' | '=' - | '\'' | '.' | ',' | ';' - )); - if self.s.scout(-1) == Some('.') { - self.s.uneat(); - } - SyntaxKind::Link(self.s.from(start).into()) - } - - fn raw(&mut self) -> SyntaxKind { - let column = self.column(self.s.cursor() - 1); - - let mut backticks = 1; - while self.s.eat_if('`') { - backticks += 1; - } - - // Special case for empty inline block. - if backticks == 2 { - return SyntaxKind::Raw(Arc::new(RawFields { - text: EcoString::new(), - lang: None, - block: false, - })); - } - - let start = self.s.cursor(); - let mut found = 0; - while found < backticks { - match self.s.eat() { - Some('`') => found += 1, - Some(_) => found = 0, - None => break, - } - } - - if found == backticks { - let end = self.s.cursor() - found as usize; - SyntaxKind::Raw(Arc::new(resolve_raw( - column, - backticks, - self.s.get(start..end), - ))) - } else { - self.terminated = false; - let remaining = backticks - found; - let noun = if remaining == 1 { "backtick" } else { "backticks" }; - SyntaxKind::Error( - ErrorPos::End, - if found == 0 { - format_eco!("expected {} {}", remaining, noun) - } else { - format_eco!("expected {} more {}", remaining, noun) - }, - ) - } - } - - fn numbering(&mut self, start: usize) -> SyntaxKind { - self.s.eat_while(char::is_ascii_digit); - let read = self.s.from(start); - if self.s.eat_if('.') { - if let Ok(number) = read.parse::() { - return match NonZeroUsize::new(number) { - Some(number) => SyntaxKind::EnumNumbering(number), - None => SyntaxKind::Error(ErrorPos::Full, "must be positive".into()), - }; - } - } - - self.text(start) - } - - fn reference(&mut self) -> SyntaxKind { - SyntaxKind::Ref(self.s.eat_while(is_id_continue).into()) - } - - fn in_word(&self) -> bool { - let alphanumeric = |c: Option| c.map_or(false, |c| c.is_alphanumeric()); - let prev = self.s.scout(-2); - let next = self.s.peek(); - alphanumeric(prev) && alphanumeric(next) - } -} - -/// Math. -impl Tokens<'_> { - fn math(&mut self, start: usize, c: char) -> SyntaxKind { - match c { - // Symbol shorthands. - '|' if self.s.eat_if("->") => SyntaxKind::Shorthand('\u{21A6}'), - '<' if self.s.eat_if("->") => SyntaxKind::Shorthand('\u{2194}'), - '<' if self.s.eat_if("=>") => SyntaxKind::Shorthand('\u{21D4}'), - '!' if self.s.eat_if('=') => SyntaxKind::Shorthand('\u{2260}'), - '<' if self.s.eat_if('=') => SyntaxKind::Shorthand('\u{2264}'), - '>' if self.s.eat_if('=') => SyntaxKind::Shorthand('\u{2265}'), - '<' if self.s.eat_if('-') => SyntaxKind::Shorthand('\u{2190}'), - '-' if self.s.eat_if('>') => SyntaxKind::Shorthand('\u{2192}'), - '=' if self.s.eat_if('>') => SyntaxKind::Shorthand('\u{21D2}'), - ':' if self.s.eat_if('=') => SyntaxKind::Shorthand('\u{2254}'), - - // Multi-char things. - '#' => self.hash(start), - - // Escape sequences. - '\\' => self.backslash(), - - // Single-char things. - '_' => SyntaxKind::Underscore, - '^' => SyntaxKind::Hat, - '/' => SyntaxKind::Slash, - '&' => SyntaxKind::Amp, - '$' => SyntaxKind::Dollar, - - // Symbol notation. - ':' => self.colon(), - - // Strings. - '"' => self.string(), - - // Identifiers and symbol notation. - c if is_math_id_start(c) && self.s.at(is_math_id_continue) => { - self.s.eat_while(is_math_id_continue); - - let mut symbol = false; - while self.s.eat_if(':') - && !self.s.eat_while(char::is_alphanumeric).is_empty() - { - symbol = true; - } - - if symbol { - SyntaxKind::Symbol(self.s.from(start).into()) - } else { - if self.s.scout(-1) == Some(':') { - self.s.uneat(); - } - - SyntaxKind::Ident(self.s.from(start).into()) - } - } - - // Numbers. - c if c.is_numeric() => { - self.s.eat_while(char::is_numeric); - SyntaxKind::Atom(self.s.from(start).into()) - } - - // Other math atoms. - c => SyntaxKind::Atom(c.into()), - } - } -} - -/// Code. -impl Tokens<'_> { - fn code(&mut self, start: usize, c: char) -> SyntaxKind { - match c { - // Blocks. - '{' => SyntaxKind::LeftBrace, - '}' => SyntaxKind::RightBrace, - '[' => SyntaxKind::LeftBracket, - ']' => SyntaxKind::RightBracket, - - // Parentheses. - '(' => SyntaxKind::LeftParen, - ')' => SyntaxKind::RightParen, - - // Math. - '$' => SyntaxKind::Dollar, - - // Labels and raw. - '<' if self.s.at(is_id_continue) => self.label(), - '`' => self.raw(), - - // Two-char operators. - '=' if self.s.eat_if('=') => SyntaxKind::EqEq, - '!' if self.s.eat_if('=') => SyntaxKind::ExclEq, - '<' if self.s.eat_if('=') => SyntaxKind::LtEq, - '>' if self.s.eat_if('=') => SyntaxKind::GtEq, - '+' if self.s.eat_if('=') => SyntaxKind::PlusEq, - '-' if self.s.eat_if('=') => SyntaxKind::HyphEq, - '*' if self.s.eat_if('=') => SyntaxKind::StarEq, - '/' if self.s.eat_if('=') => SyntaxKind::SlashEq, - '.' if self.s.eat_if('.') => SyntaxKind::Dots, - '=' if self.s.eat_if('>') => SyntaxKind::Arrow, - - // Single-char operators. - ',' => SyntaxKind::Comma, - ';' => SyntaxKind::Semicolon, - ':' => SyntaxKind::Colon, - '+' => SyntaxKind::Plus, - '-' => SyntaxKind::Minus, - '*' => SyntaxKind::Star, - '/' => SyntaxKind::Slash, - '=' => SyntaxKind::Eq, - '<' => SyntaxKind::Lt, - '>' => SyntaxKind::Gt, - '.' if !self.s.at(char::is_ascii_digit) => SyntaxKind::Dot, - - // Identifiers. - c if is_id_start(c) => self.ident(start), - - // Numbers. - c if c.is_ascii_digit() || (c == '.' && self.s.at(char::is_ascii_digit)) => { - self.number(start, c) - } - - // Strings. - '"' => self.string(), - - // Invalid token. - _ => SyntaxKind::Error(ErrorPos::Full, "not valid here".into()), - } - } - - fn ident(&mut self, start: usize) -> SyntaxKind { - self.s.eat_while(is_id_continue); - match self.s.from(start) { - "none" => SyntaxKind::None, - "auto" => SyntaxKind::Auto, - "true" => SyntaxKind::Bool(true), - "false" => SyntaxKind::Bool(false), - id => keyword(id).unwrap_or_else(|| SyntaxKind::Ident(id.into())), - } - } - - fn number(&mut self, start: usize, c: char) -> SyntaxKind { - // Read the first part (integer or fractional depending on `first`). - self.s.eat_while(char::is_ascii_digit); - - // Read the fractional part if not already done. - // Make sure not to confuse a range for the decimal separator. - if c != '.' && !self.s.at("..") && self.s.eat_if('.') { - self.s.eat_while(char::is_ascii_digit); - } - - // Read the exponent. - if !self.s.at("em") && self.s.eat_if(['e', 'E']) { - self.s.eat_if(['+', '-']); - self.s.eat_while(char::is_ascii_digit); - } - - // Read the suffix. - let suffix_start = self.s.cursor(); - if !self.s.eat_if('%') { - self.s.eat_while(char::is_ascii_alphanumeric); - } - - let number = self.s.get(start..suffix_start); - let suffix = self.s.from(suffix_start); - - // Find out whether it is a simple number. - if suffix.is_empty() { - if let Ok(i) = number.parse::() { - return SyntaxKind::Int(i); - } - } - - let Ok(v) = number.parse::() else { - return SyntaxKind::Error(ErrorPos::Full, "invalid number".into()); - }; - - match suffix { - "" => SyntaxKind::Float(v), - "pt" => SyntaxKind::Numeric(v, Unit::Length(AbsUnit::Pt)), - "mm" => SyntaxKind::Numeric(v, Unit::Length(AbsUnit::Mm)), - "cm" => SyntaxKind::Numeric(v, Unit::Length(AbsUnit::Cm)), - "in" => SyntaxKind::Numeric(v, Unit::Length(AbsUnit::In)), - "deg" => SyntaxKind::Numeric(v, Unit::Angle(AngleUnit::Deg)), - "rad" => SyntaxKind::Numeric(v, Unit::Angle(AngleUnit::Rad)), - "em" => SyntaxKind::Numeric(v, Unit::Em), - "fr" => SyntaxKind::Numeric(v, Unit::Fr), - "%" => SyntaxKind::Numeric(v, Unit::Percent), - _ => SyntaxKind::Error(ErrorPos::Full, "invalid number suffix".into()), - } - } - - fn string(&mut self) -> SyntaxKind { - let mut escaped = false; - let verbatim = self.s.eat_until(|c| { - if c == '"' && !escaped { - true - } else { - escaped = c == '\\' && !escaped; - false - } - }); - - let string = resolve_string(verbatim); - if self.s.eat_if('"') { - SyntaxKind::Str(string) - } else { - self.terminated = false; - SyntaxKind::Error(ErrorPos::End, "expected quote".into()) - } - } - - fn label(&mut self) -> SyntaxKind { - let label = self.s.eat_while(is_id_continue); - if self.s.eat_if('>') { - if !label.is_empty() { - SyntaxKind::Label(label.into()) - } else { - SyntaxKind::Error(ErrorPos::Full, "label cannot be empty".into()) - } - } else { - self.terminated = false; - SyntaxKind::Error(ErrorPos::End, "expected closing angle bracket".into()) - } - } -} - -/// Try to parse an identifier into a keyword. -fn keyword(ident: &str) -> Option { - Some(match ident { - "not" => SyntaxKind::Not, - "and" => SyntaxKind::And, - "or" => SyntaxKind::Or, - "let" => SyntaxKind::Let, - "set" => SyntaxKind::Set, - "show" => SyntaxKind::Show, - "if" => SyntaxKind::If, - "else" => SyntaxKind::Else, - "for" => SyntaxKind::For, - "in" => SyntaxKind::In, - "while" => SyntaxKind::While, - "break" => SyntaxKind::Break, - "continue" => SyntaxKind::Continue, - "return" => SyntaxKind::Return, - "import" => SyntaxKind::Import, - "include" => SyntaxKind::Include, - "as" => SyntaxKind::As, - _ => return None, - }) -} - -/// The column index of a given index in the source string, given a column -/// offset for the first line. -fn column(string: &str, index: usize, offset: usize) -> usize { - let mut apply_offset = false; - let res = string[..index] - .char_indices() - .rev() - .take_while(|&(_, c)| !is_newline(c)) - .inspect(|&(i, _)| { - if i == 0 { - apply_offset = true - } - }) - .count(); - - // The loop is never executed if the slice is empty, but we are of - // course still at the start of the first line. - if index == 0 { - apply_offset = true; - } - - if apply_offset { - res + offset - } else { - res - } -} - -/// Whether this character denotes a newline. -#[inline] -pub fn is_newline(character: char) -> bool { - matches!( - character, - // Line Feed, Vertical Tab, Form Feed, Carriage Return. - '\n' | '\x0B' | '\x0C' | '\r' | - // Next Line, Line Separator, Paragraph Separator. - '\u{0085}' | '\u{2028}' | '\u{2029}' - ) -} - -/// Whether a string is a valid unicode identifier. -/// -/// In addition to what is specified in the [Unicode Standard][uax31], we allow: -/// - `_` as a starting character, -/// - `_` and `-` as continuing characters. -/// -/// [uax31]: http://www.unicode.org/reports/tr31/ -#[inline] -pub fn is_ident(string: &str) -> bool { - let mut chars = string.chars(); - chars - .next() - .map_or(false, |c| is_id_start(c) && chars.all(is_id_continue)) -} - -/// Whether a character can start an identifier. -#[inline] -fn is_id_start(c: char) -> bool { - c.is_xid_start() || c == '_' -} - -/// Whether a character can continue an identifier. -#[inline] -fn is_id_continue(c: char) -> bool { - c.is_xid_continue() || c == '_' || c == '-' -} - -/// Whether a character can start an identifier in math. -#[inline] -fn is_math_id_start(c: char) -> bool { - c.is_xid_start() -} - -/// Whether a character can continue an identifier in math. -#[inline] -fn is_math_id_continue(c: char) -> bool { - c.is_xid_continue() && c != '_' -} -- cgit v1.2.3