From 5a8534a395b500a25cbc46ee15ec031c8231de59 Mon Sep 17 00:00:00 2001 From: Laurenz Date: Tue, 4 Oct 2022 13:42:49 +0200 Subject: Parse basic math syntax --- src/parse/mod.rs | 152 +++++++++++++++++++++++++++++++++++++++++++++------- src/parse/parser.rs | 34 ++++++++---- src/parse/tokens.rs | 142 +++++++++++++++++++++++++----------------------- 3 files changed, 234 insertions(+), 94 deletions(-) (limited to 'src/parse') diff --git a/src/parse/mod.rs b/src/parse/mod.rs index ed8bc5ce..7eb7343b 100644 --- a/src/parse/mod.rs +++ b/src/parse/mod.rs @@ -11,7 +11,7 @@ pub use tokens::*; use std::collections::HashSet; -use crate::syntax::ast::{Associativity, BinOp, UnOp}; +use crate::syntax::ast::{Assoc, BinOp, UnOp}; use crate::syntax::{NodeKind, SpanPos, SyntaxNode}; use crate::util::EcoString; @@ -22,11 +22,22 @@ pub fn parse(text: &str) -> SyntaxNode { p.finish().into_iter().next().unwrap() } +/// Parse math directly, only used for syntax highlighting. +pub fn parse_math(text: &str) -> SyntaxNode { + let mut p = Parser::new(text, TokenMode::Math); + p.perform(NodeKind::Math, |p| { + while !p.eof() { + math_node(p); + } + }); + p.finish().into_iter().next().unwrap() +} + /// Parse code directly, only used for syntax highlighting. -pub fn parse_code(text: &str) -> Vec<SyntaxNode> { +pub fn parse_code(text: &str) -> SyntaxNode { let mut p = Parser::new(text, TokenMode::Code); - code(&mut p); - p.finish() + p.perform(NodeKind::CodeBlock, code); + p.finish().into_iter().next().unwrap() } /// Reparse a code block. @@ -240,20 +251,20 @@ fn markup_node(p: &mut Parser, at_start: &mut bool) { // Text and markup. NodeKind::Text(_) | NodeKind::Linebreak { .. } - | NodeKind::NonBreakingSpace - | NodeKind::Shy - | NodeKind::EnDash - | NodeKind::EmDash - | NodeKind::Ellipsis + | NodeKind::Tilde + | NodeKind::HyphQuest + | NodeKind::Hyph2 + | NodeKind::Hyph3 + | NodeKind::Dot3 | NodeKind::Quote { .. 
} | NodeKind::Escape(_) | NodeKind::Link(_) | NodeKind::Raw(_) - | NodeKind::Math(_) | NodeKind::Label(_) - | NodeKind::Ref(_) => { - p.eat(); - } + | NodeKind::Ref(_) => p.eat(), + + // Math. + NodeKind::Dollar => math(p), // Strong, emph, heading. NodeKind::Star => strong(p), @@ -405,6 +416,111 @@ fn markup_expr(p: &mut Parser) { p.end_group(); } +/// Parse math. +fn math(p: &mut Parser) { + p.perform(NodeKind::Math, |p| { + p.start_group(Group::Math); + while !p.eof() { + math_node(p); + } + p.end_group(); + }); +} + +/// Parse a math node. +fn math_node(p: &mut Parser) { + math_node_prec(p, 0, None) +} + +/// Parse a math node with operators having at least the minimum precedence. +fn math_node_prec(p: &mut Parser, min_prec: usize, stop: Option<NodeKind>) { + let marker = p.marker(); + math_primary(p); + + loop { + let (kind, mut prec, assoc, stop) = match p.peek() { + v if v == stop.as_ref() => break, + Some(NodeKind::Underscore) => { + (NodeKind::Script, 2, Assoc::Right, Some(NodeKind::Hat)) + } + Some(NodeKind::Hat) => ( + NodeKind::Script, + 2, + Assoc::Right, + Some(NodeKind::Underscore), + ), + Some(NodeKind::Slash) => (NodeKind::Frac, 1, Assoc::Left, None), + _ => break, + }; + + if prec < min_prec { + break; + } + + match assoc { + Assoc::Left => prec += 1, + Assoc::Right => {} + } + + p.eat(); + math_node_prec(p, prec, stop); + + // Allow up to two different scripts. We do not risk encountering the + // previous script kind again here due to right-associativity. + if p.eat_if(NodeKind::Underscore) || p.eat_if(NodeKind::Hat) { + math_node_prec(p, prec, None); + } + + marker.end(p, kind); + } +} + +/// Parse a primary math node. +fn math_primary(p: &mut Parser) { + let token = match p.peek() { + Some(t) => t, + None => return, + }; + + match token { + // Spaces, atoms and expressions. + NodeKind::Space { .. } + | NodeKind::Linebreak + | NodeKind::Escape(_) + | NodeKind::Atom(_) + | NodeKind::Ident(_) => p.eat(), + + // Groups. 
+ NodeKind::LeftParen => group(p, Group::Paren), + NodeKind::LeftBracket => group(p, Group::Bracket), + NodeKind::LeftBrace => group(p, Group::Brace), + + // Alignment indicator. + NodeKind::Amp => align(p), + + _ => p.unexpected(), + } +} + +/// Parse grouped math. +fn group(p: &mut Parser, group: Group) { + p.perform(NodeKind::Math, |p| { + p.start_group(group); + while !p.eof() { + math_node(p); + } + p.end_group(); + }) +} + +/// Parse an alignment indicator. +fn align(p: &mut Parser) { + p.perform(NodeKind::Align, |p| { + p.assert(NodeKind::Amp); + while p.eat_if(NodeKind::Amp) {} + }) +} + /// Parse an expression. fn expr(p: &mut Parser) -> ParseResult { expr_prec(p, false, 0) @@ -434,7 +550,7 @@ fn expr_prec(p: &mut Parser, atomic: bool, min_prec: usize) -> ParseResult { loop { // Parenthesis or bracket means this is a function call. if let Some(NodeKind::LeftParen | NodeKind::LeftBracket) = p.peek_direct() { - marker.perform(p, NodeKind::FuncCall, |p| args(p))?; + marker.perform(p, NodeKind::FuncCall, args)?; continue; } @@ -446,7 +562,7 @@ fn expr_prec(p: &mut Parser, atomic: bool, min_prec: usize) -> ParseResult { if p.eat_if(NodeKind::Dot) { ident(p)?; if let Some(NodeKind::LeftParen | NodeKind::LeftBracket) = p.peek_direct() { - marker.perform(p, NodeKind::MethodCall, |p| args(p))?; + marker.perform(p, NodeKind::MethodCall, args)?; } else { marker.end(p, NodeKind::FieldAccess); } @@ -474,9 +590,9 @@ fn expr_prec(p: &mut Parser, atomic: bool, min_prec: usize) -> ParseResult { p.eat(); - match op.associativity() { - Associativity::Left => prec += 1, - Associativity::Right => {} + match op.assoc() { + Assoc::Left => prec += 1, + Assoc::Right => {} } marker.perform(p, NodeKind::BinaryExpr, |p| expr_prec(p, atomic, prec))?; diff --git a/src/parse/parser.rs b/src/parse/parser.rs index f8ea9614..12dd324b 100644 --- a/src/parse/parser.rs +++ b/src/parse/parser.rs @@ -92,14 +92,14 @@ impl<'s> Parser<'s> { let until = self.trivia_start(); let mut children = 
mem::replace(&mut self.children, prev); - if self.tokens.mode() == TokenMode::Code { + if self.tokens.mode() == TokenMode::Markup { + self.children.push(InnerNode::with_children(kind, children).into()); + } else { // Trailing trivia should not be wrapped into the new node. let idx = self.children.len(); self.children.push(SyntaxNode::default()); self.children.extend(children.drain(until.0 ..)); self.children[idx] = InnerNode::with_children(kind, children).into(); - } else { - self.children.push(InnerNode::with_children(kind, children).into()); } output @@ -122,7 +122,7 @@ impl<'s> Parser<'s> { self.prev_end = self.tokens.cursor(); self.bump(); - if self.tokens.mode() == TokenMode::Code { + if self.tokens.mode() != TokenMode::Markup { // Skip whitespace and comments. while self.current.as_ref().map_or(false, |x| self.is_trivia(x)) { self.bump(); @@ -232,8 +232,17 @@ impl<'s> Parser<'s> { pub fn start_group(&mut self, kind: Group) { self.groups.push(GroupEntry { kind, prev_mode: self.tokens.mode() }); self.tokens.set_mode(match kind { - Group::Bracket | Group::Strong | Group::Emph => TokenMode::Markup, - Group::Brace | Group::Paren | Group::Expr | Group::Imports => TokenMode::Code, + Group::Strong | Group::Emph => TokenMode::Markup, + Group::Bracket => match self.tokens.mode() { + TokenMode::Math => TokenMode::Math, + _ => TokenMode::Markup, + }, + Group::Brace | Group::Paren => match self.tokens.mode() { + TokenMode::Math => TokenMode::Math, + _ => TokenMode::Code, + }, + Group::Math => TokenMode::Math, + Group::Expr | Group::Imports => TokenMode::Code, }); match kind { @@ -242,6 +251,7 @@ impl<'s> Parser<'s> { Group::Paren => self.assert(NodeKind::LeftParen), Group::Strong => self.assert(NodeKind::Star), Group::Emph => self.assert(NodeKind::Underscore), + Group::Math => self.assert(NodeKind::Dollar), Group::Expr => self.repeek(), Group::Imports => self.repeek(), } @@ -260,11 +270,12 @@ impl<'s> Parser<'s> { // Eat the end delimiter if there is one. 
if let Some((end, required)) = match group.kind { - Group::Paren => Some((NodeKind::RightParen, true)), - Group::Bracket => Some((NodeKind::RightBracket, true)), Group::Brace => Some((NodeKind::RightBrace, true)), + Group::Bracket => Some((NodeKind::RightBracket, true)), + Group::Paren => Some((NodeKind::RightParen, true)), Group::Strong => Some((NodeKind::Star, true)), Group::Emph => Some((NodeKind::Underscore, true)), + Group::Math => Some((NodeKind::Dollar, true)), Group::Expr => Some((NodeKind::Semicolon, false)), Group::Imports => None, } { @@ -290,7 +301,7 @@ impl<'s> Parser<'s> { // Rescan the peeked token if the mode changed. if rescan { let mut target = self.prev_end(); - if group_mode == TokenMode::Code { + if group_mode != TokenMode::Markup { let start = self.trivia_start().0; target = self.current_start - self.children[start ..].iter().map(SyntaxNode::len).sum::<usize>(); @@ -330,6 +341,7 @@ impl<'s> Parser<'s> { Some(NodeKind::RightParen) => self.inside(Group::Paren), Some(NodeKind::Star) => self.inside(Group::Strong), Some(NodeKind::Underscore) => self.inside(Group::Emph), + Some(NodeKind::Dollar) => self.inside(Group::Math), Some(NodeKind::Semicolon) => self.inside(Group::Expr), Some(NodeKind::From) => self.inside(Group::Imports), Some(NodeKind::Space { newlines }) => self.space_ends_group(*newlines), @@ -472,7 +484,7 @@ impl Marker { } // Don't expose trivia in code. - if p.tokens.mode() == TokenMode::Code && child.kind().is_trivia() { + if p.tokens.mode() != TokenMode::Markup && child.kind().is_trivia() { continue; } @@ -515,6 +527,8 @@ pub enum Group { Strong, /// A group surrounded with underscore: `_..._`. Emph, + /// A group surrounded by dollar signs: `$...$`. + Math, /// A group ended by a semicolon or a line break: `;`, `\n`. Expr, /// A group for import items, ended by a semicolon, line break or `from`. 
diff --git a/src/parse/tokens.rs b/src/parse/tokens.rs index f6d4b0e8..d495afa0 100644 --- a/src/parse/tokens.rs +++ b/src/parse/tokens.rs @@ -5,7 +5,7 @@ use unscanny::Scanner; use super::resolve::{resolve_hex, resolve_raw, resolve_string}; use crate::geom::{AngleUnit, LengthUnit}; -use crate::syntax::ast::{MathNode, RawNode, Unit}; +use crate::syntax::ast::{RawNode, Unit}; use crate::syntax::{NodeKind, SpanPos}; use crate::util::EcoString; @@ -27,6 +27,8 @@ pub struct Tokens<'s> { pub enum TokenMode { /// Text and markup. Markup, + /// Math atoms, operators, etc. + Math, /// Keywords, literals and operators. Code, } @@ -103,23 +105,16 @@ impl<'s> Iterator for Tokens<'s> { let start = self.s.cursor(); let c = self.s.eat()?; Some(match c { - // Comments. + // Trivia. '/' if self.s.eat_if('/') => self.line_comment(), '/' if self.s.eat_if('*') => self.block_comment(), '*' if self.s.eat_if('/') => NodeKind::Unknown("*/".into()), - - // Blocks. - '{' => NodeKind::LeftBrace, - '}' => NodeKind::RightBrace, - '[' => NodeKind::LeftBracket, - ']' => NodeKind::RightBracket, - - // Whitespace. c if c.is_whitespace() => self.whitespace(c), // Other things. _ => match self.mode { TokenMode::Markup => self.markup(start, c), + TokenMode::Math => self.math(start, c), TokenMode::Code => self.code(start, c), }, }) @@ -195,16 +190,23 @@ impl<'s> Tokens<'s> { #[inline] fn markup(&mut self, start: usize, c: char) -> NodeKind { match c { + // Blocks. + '{' => NodeKind::LeftBrace, + '}' => NodeKind::RightBrace, + '[' => NodeKind::LeftBracket, + ']' => NodeKind::RightBracket, + // Escape sequences. '\\' => self.backslash(), // Single-char things. - '~' => NodeKind::NonBreakingSpace, - '.' if self.s.eat_if("..") => NodeKind::Ellipsis, + '~' => NodeKind::Tilde, + '.' 
if self.s.eat_if("..") => NodeKind::Dot3, '\'' => NodeKind::Quote { double: false }, '"' => NodeKind::Quote { double: true }, '*' if !self.in_word() => NodeKind::Star, '_' if !self.in_word() => NodeKind::Underscore, + '$' => NodeKind::Dollar, '=' => NodeKind::Eq, '+' => NodeKind::Plus, '/' => NodeKind::Slash, @@ -217,7 +219,6 @@ impl<'s> Tokens<'s> { self.link(start) } '`' => self.raw(), - '$' => self.math(), c if c.is_ascii_digit() => self.numbering(start), '<' => self.label(), '@' => self.reference(start), @@ -313,12 +314,12 @@ impl<'s> Tokens<'s> { fn hyph(&mut self) -> NodeKind { if self.s.eat_if('-') { if self.s.eat_if('-') { - NodeKind::EmDash + NodeKind::Hyph3 } else { - NodeKind::EnDash + NodeKind::Hyph2 } } else if self.s.eat_if('?') { - NodeKind::Shy + NodeKind::HyphQuest } else { NodeKind::Minus } @@ -395,29 +396,6 @@ impl<'s> Tokens<'s> { } } - fn math(&mut self) -> NodeKind { - let mut escaped = false; - let formula = self.s.eat_until(|c| { - if c == '$' && !escaped { - true - } else { - escaped = c == '\\' && !escaped; - false - } - }); - - let display = formula.len() >= 2 - && formula.starts_with(char::is_whitespace) - && formula.ends_with(char::is_whitespace); - - if self.s.eat_if('$') { - NodeKind::Math(Arc::new(MathNode { formula: formula.into(), display })) - } else { - self.terminated = false; - NodeKind::Error(SpanPos::End, "expected dollar sign".into()) - } - } - fn numbering(&mut self, start: usize) -> NodeKind { self.s.eat_while(char::is_ascii_digit); let read = self.s.from(start); @@ -453,8 +431,51 @@ impl<'s> Tokens<'s> { } } + fn math(&mut self, start: usize, c: char) -> NodeKind { + match c { + // Escape sequences. + '\\' => self.backslash(), + + // Single-char things. + '_' => NodeKind::Underscore, + '^' => NodeKind::Hat, + '/' => NodeKind::Slash, + '&' => NodeKind::Amp, + '$' => NodeKind::Dollar, + + // Brackets. 
+ '{' => NodeKind::LeftBrace, + '}' => NodeKind::RightBrace, + '[' => NodeKind::LeftBracket, + ']' => NodeKind::RightBracket, + '(' => NodeKind::LeftParen, + ')' => NodeKind::RightParen, + + // Identifiers. + c if is_math_id_start(c) && self.s.at(is_math_id_continue) => { + self.s.eat_while(is_math_id_continue); + NodeKind::Ident(self.s.from(start).into()) + } + + // Numbers. + c if c.is_numeric() => { + self.s.eat_while(char::is_numeric); + NodeKind::Atom(self.s.from(start).into()) + } + + // Other math atoms. + c => NodeKind::Atom(c.into()), + } + } + fn code(&mut self, start: usize, c: char) -> NodeKind { match c { + // Blocks. + '{' => NodeKind::LeftBrace, + '}' => NodeKind::RightBrace, + '[' => NodeKind::LeftBracket, + ']' => NodeKind::RightBracket, + // Parentheses. '(' => NodeKind::LeftParen, ')' => NodeKind::RightParen, @@ -673,6 +694,18 @@ fn is_id_continue(c: char) -> bool { c.is_xid_continue() || c == '_' || c == '-' } +/// Whether a character can start an identifier in math. +#[inline] +fn is_math_id_start(c: char) -> bool { + c.is_xid_start() +} + +/// Whether a character can continue an identifier in math. +#[inline] +fn is_math_id_continue(c: char) -> bool { + c.is_xid_continue() && c != '_' +} + #[cfg(test)] #[allow(non_snake_case)] mod tests { @@ -696,10 +729,6 @@ mod tests { })) } - fn Math(formula: &str, display: bool) -> NodeKind { - NodeKind::Math(Arc::new(MathNode { formula: formula.into(), display })) - } - fn Str(string: &str) -> NodeKind { NodeKind::Str(string.into()) } @@ -770,7 +799,6 @@ mod tests { ('/', None, "//", LineComment), ('/', None, "/**/", BlockComment), ('/', Some(Markup), "*", Star), - ('/', Some(Markup), "$ $", Math(" ", false)), ('/', Some(Markup), r"\\", Escape('\\')), ('/', Some(Markup), "#let", Let), ('/', Some(Code), "(", LeftParen), @@ -853,7 +881,7 @@ mod tests { // Test text ends. 
t!(Markup[""]: "hello " => Text("hello"), Space(0)); - t!(Markup[""]: "hello~" => Text("hello"), NonBreakingSpace); + t!(Markup[""]: "hello~" => Text("hello"), Tilde); } #[test] @@ -899,9 +927,9 @@ mod tests { t!(Markup[""]: "===" => Eq, Eq, Eq); t!(Markup["a1/"]: "= " => Eq, Space(0)); t!(Markup[" "]: r"\" => Linebreak); - t!(Markup: "~" => NonBreakingSpace); - t!(Markup["a1/"]: "-?" => Shy); - t!(Markup["a "]: r"a--" => Text("a"), EnDash); + t!(Markup: "~" => Tilde); + t!(Markup["a1/"]: "-?" => HyphQuest); + t!(Markup["a "]: r"a--" => Text("a"), Hyph2); t!(Markup["a1/"]: "- " => Minus, Space(0)); t!(Markup[" "]: "+" => Plus); t!(Markup[" "]: "1." => EnumNumbering(1)); @@ -998,24 +1026,6 @@ mod tests { t!(Markup[""]: "````raw``````" => Raw("", Some("raw"), false), Raw("", None, false)); } - #[test] - fn test_tokenize_math_formulas() { - // Test basic formula. - t!(Markup: "$$" => Math("", false)); - t!(Markup: "$x$" => Math("x", false)); - t!(Markup: r"$\\$" => Math(r"\\", false)); - t!(Markup: r"$[\\]$" => Math(r"[\\]", false)); - t!(Markup: "$ x + y $" => Math(" x + y ", true)); - - // Test unterminated. - t!(Markup[""]: "$x" => Error(End, "expected dollar sign")); - t!(Markup[""]: "$[x]\n" => Error(End, "expected dollar sign")); - - // Test escape sequences. - t!(Markup: r"$\$x$" => Math(r"\$x", false)); - t!(Markup: r"$\ \$ $" => Math(r"\ \$ ", false)); - } - #[test] fn test_tokenize_idents() { // Test valid identifiers. -- cgit v1.2.3