diff options
| author | Laurenz <laurmaedje@gmail.com> | 2022-10-17 19:26:24 +0200 |
|---|---|---|
| committer | Laurenz <laurmaedje@gmail.com> | 2022-10-17 20:04:22 +0200 |
| commit | e21822665591dc19766275da1e185215a6b945ef (patch) | |
| tree | 7788e211c3c33c8b5a8ad7d5eb7574e33631eb16 /src/parse | |
| parent | 4fd031a256b2ecfe524859d5599fafb386395572 (diff) | |
Merge some modules
Diffstat (limited to 'src/parse')
| -rw-r--r-- | src/parse/incremental.rs | 520 | ||||
| -rw-r--r-- | src/parse/mod.rs | 1166 | ||||
| -rw-r--r-- | src/parse/parser.rs | 559 | ||||
| -rw-r--r-- | src/parse/resolve.rs | 238 | ||||
| -rw-r--r-- | src/parse/tokens.rs | 1176 |
5 files changed, 0 insertions, 3659 deletions
diff --git a/src/parse/incremental.rs b/src/parse/incremental.rs deleted file mode 100644 index 4651a784..00000000 --- a/src/parse/incremental.rs +++ /dev/null @@ -1,520 +0,0 @@ -use std::ops::Range; -use std::sync::Arc; - -use crate::syntax::{InnerNode, NodeKind, Span, SyntaxNode}; - -use super::{ - is_newline, parse, reparse_code_block, reparse_content_block, reparse_markup_elements, -}; - -/// Refresh the given syntax node with as little parsing as possible. -/// -/// Takes the new source, the range in the old source that was replaced and the -/// length of the replacement. -/// -/// Returns the range in the new source that was ultimately reparsed. -pub fn reparse( - root: &mut SyntaxNode, - text: &str, - replaced: Range<usize>, - replacement_len: usize, -) -> Range<usize> { - if let SyntaxNode::Inner(inner) = root { - let change = Change { text, replaced, replacement_len }; - if let Some(range) = try_reparse(&change, Arc::make_mut(inner), 0, true, true) { - return range; - } - } - - let id = root.span().source(); - *root = parse(text); - root.numberize(id, Span::FULL).unwrap(); - 0 .. text.len() -} - -/// Try to reparse inside the given node. -fn try_reparse( - change: &Change, - node: &mut InnerNode, - mut offset: usize, - outermost: bool, - safe_to_replace: bool, -) -> Option<Range<usize>> { - let is_markup = matches!(node.kind(), NodeKind::Markup { .. }); - let original_count = node.children().len(); - let original_offset = offset; - - let mut search = SearchState::default(); - let mut ahead: Option<Ahead> = None; - - // Whether the first node that should be replaced is at start. - let mut at_start = true; - - // Whether the last searched child is the outermost child. - let mut child_outermost = false; - - // Find the the first child in the range of children to reparse. - for (i, child) in node.children().enumerate() { - let pos = NodePos { idx: i, offset }; - let child_span = offset .. offset + child.len(); - child_outermost = outermost && i + 1 == original_count; - - match search { - SearchState::NoneFound => { - // The edit is contained within the span of the current element. - if child_span.contains(&change.replaced.start) - && child_span.end >= change.replaced.end - { - // In Markup mode, we want to consider a non-whitespace - // neighbor if the edit is on the node boundary. - search = if is_markup && child_span.end == change.replaced.end { - SearchState::RequireNonTrivia(pos) - } else { - SearchState::Contained(pos) - }; - } else if child_span.contains(&change.replaced.start) { - search = SearchState::Inside(pos); - } else if child_span.end == change.replaced.start - && change.replaced.start == change.replaced.end - && child_outermost - { - search = SearchState::SpanFound(pos, pos); - } else { - // Update compulsary state of `ahead_nontrivia`. - if let Some(ahead_nontrivia) = ahead.as_mut() { - if let NodeKind::Space { newlines: (1 ..) } = child.kind() { - ahead_nontrivia.newline(); - } - } - - // We look only for non spaces, non-semicolon and also - // reject text that points to the special case for URL - // evasion and line comments. - if !child.kind().is_space() - && child.kind() != &NodeKind::Semicolon - && child.kind() != &NodeKind::Text('/'.into()) - && (ahead.is_none() || change.replaced.start > child_span.end) - && !ahead.map_or(false, Ahead::is_compulsory) - { - ahead = Some(Ahead::new(pos, at_start, is_bounded(child.kind()))); - } - - at_start = next_at_start(child.kind(), at_start); - } - } - SearchState::Inside(start) => { - if child_span.end == change.replaced.end { - search = SearchState::RequireNonTrivia(start); - } else if child_span.end > change.replaced.end { - search = SearchState::SpanFound(start, pos); - } - } - SearchState::RequireNonTrivia(start) => { - if !child.kind().is_trivia() { - search = SearchState::SpanFound(start, pos); - } - } - _ => unreachable!(), - } - - offset += child.len(); - - if search.done().is_some() { - break; - } - } - - // If we were looking for a non-whitespace element and hit the end of - // the file here, we instead use EOF as the end of the span. - if let SearchState::RequireNonTrivia(start) = search { - search = SearchState::SpanFound(start, NodePos { - idx: node.children().len() - 1, - offset: offset - node.children().last().unwrap().len(), - }) - } - - if let SearchState::Contained(pos) = search { - // Do not allow replacement of elements inside of constructs whose - // opening and closing brackets look the same. - let safe_inside = is_bounded(node.kind()); - let child = &mut node.children_mut()[pos.idx]; - let prev_len = child.len(); - let prev_descendants = child.descendants(); - - if let Some(range) = match child { - SyntaxNode::Inner(node) => try_reparse( - change, - Arc::make_mut(node), - pos.offset, - child_outermost, - safe_inside, - ), - SyntaxNode::Leaf(_) => None, - } { - let new_len = child.len(); - let new_descendants = child.descendants(); - node.update_parent(prev_len, new_len, prev_descendants, new_descendants); - return Some(range); - } - - let superseded_span = pos.offset .. pos.offset + prev_len; - let func: Option<ReparseMode> = match child.kind() { - NodeKind::CodeBlock => Some(ReparseMode::Code), - NodeKind::ContentBlock => Some(ReparseMode::Content), - _ => None, - }; - - // Return if the element was reparsable on its own, otherwise try to - // treat it as a markup element. - if let Some(func) = func { - if let Some(result) = replace( - change, - node, - func, - pos.idx .. pos.idx + 1, - superseded_span, - outermost, - ) { - return Some(result); - } - } - } - - // Make sure this is a markup node and that we may replace. If so, save - // the current indent. - let min_indent = match node.kind() { - NodeKind::Markup { min_indent } if safe_to_replace => *min_indent, - _ => return None, - }; - - let (mut start, end) = search.done()?; - if let Some(ahead) = ahead { - if start.offset == change.replaced.start || ahead.is_compulsory() { - start = ahead.pos; - at_start = ahead.at_start; - } - } else { - start = NodePos { idx: 0, offset: original_offset }; - } - - let superseded_span = - start.offset .. end.offset + node.children().as_slice()[end.idx].len(); - - replace( - change, - node, - ReparseMode::MarkupElements { at_start, min_indent }, - start.idx .. end.idx + 1, - superseded_span, - outermost, - ) -} - -/// Reparse the superseded nodes and replace them. -fn replace( - change: &Change, - node: &mut InnerNode, - mode: ReparseMode, - superseded_idx: Range<usize>, - superseded_span: Range<usize>, - outermost: bool, -) -> Option<Range<usize>> { - let superseded_start = superseded_idx.start; - - let differential: isize = - change.replacement_len as isize - change.replaced.len() as isize; - let newborn_end = (superseded_span.end as isize + differential) as usize; - let newborn_span = superseded_span.start .. newborn_end; - - let mut prefix = ""; - for (i, c) in change.text[.. newborn_span.start].char_indices().rev() { - if is_newline(c) { - break; - } - prefix = &change.text[i .. newborn_span.start]; - } - - let (newborns, terminated, amount) = match mode { - ReparseMode::Code => reparse_code_block( - &prefix, - &change.text[newborn_span.start ..], - newborn_span.len(), - ), - ReparseMode::Content => reparse_content_block( - &prefix, - &change.text[newborn_span.start ..], - newborn_span.len(), - ), - ReparseMode::MarkupElements { at_start, min_indent } => reparse_markup_elements( - &prefix, - &change.text[newborn_span.start ..], - newborn_span.len(), - differential, - &node.children().as_slice()[superseded_start ..], - at_start, - min_indent, - ), - }?; - - // Do not accept unclosed nodes if the old node wasn't at the right edge - // of the tree. - if !outermost && !terminated { - return None; - } - - node.replace_children(superseded_start .. superseded_start + amount, newborns) - .ok()?; - - Some(newborn_span) -} - -/// A description of a change. -struct Change<'a> { - /// The new source code, with the change applied. - text: &'a str, - /// Which range in the old source file was changed. - replaced: Range<usize>, - /// How many characters replaced the text in `replaced`. - replacement_len: usize, -} - -/// Encodes the state machine of the search for the nodes are pending for -/// replacement. -#[derive(Clone, Copy, Debug, PartialEq)] -enum SearchState { - /// Neither an end nor a start have been found as of now. - /// The latest non-trivia child is continually saved. - NoneFound, - /// The search has concluded by finding a node that fully contains the - /// modifications. - Contained(NodePos), - /// The search has found the start of the modified nodes. - Inside(NodePos), - /// The search has found the end of the modified nodes but the change - /// touched its boundries so another non-trivia node is needed. - RequireNonTrivia(NodePos), - /// The search has concluded by finding a start and an end index for nodes - /// with a pending reparse. - SpanFound(NodePos, NodePos), -} - -impl Default for SearchState { - fn default() -> Self { - Self::NoneFound - } -} - -impl SearchState { - fn done(self) -> Option<(NodePos, NodePos)> { - match self { - Self::NoneFound => None, - Self::Contained(s) => Some((s, s)), - Self::Inside(_) => None, - Self::RequireNonTrivia(_) => None, - Self::SpanFound(s, e) => Some((s, e)), - } - } -} - -/// The position of a syntax node. -#[derive(Clone, Copy, Debug, PartialEq)] -struct NodePos { - /// The index in the parent node. - idx: usize, - /// The byte offset in the string. - offset: usize, -} - -/// An ahead node with an index and whether it is `at_start`. -#[derive(Clone, Copy, Debug, PartialEq)] -struct Ahead { - /// The position of the node. - pos: NodePos, - /// The `at_start` before this node. - at_start: bool, - /// The kind of ahead node. - kind: AheadKind, -} - -/// The kind of ahead node. -#[derive(Clone, Copy, Debug, PartialEq)] -enum AheadKind { - /// A normal non-trivia child has been found. - Normal, - /// An unbounded child has been found. The boolean indicates whether it was - /// on the current line, in which case adding it to the reparsing range is - /// compulsory. - Unbounded(bool), -} - -impl Ahead { - fn new(pos: NodePos, at_start: bool, bounded: bool) -> Self { - Self { - pos, - at_start, - kind: if bounded { - AheadKind::Normal - } else { - AheadKind::Unbounded(true) - }, - } - } - - fn newline(&mut self) { - if let AheadKind::Unbounded(current_line) = &mut self.kind { - *current_line = false; - } - } - - fn is_compulsory(self) -> bool { - matches!(self.kind, AheadKind::Unbounded(true)) - } -} - -/// Which reparse function to choose for a span of elements. -#[derive(Clone, Copy, Debug, PartialEq)] -enum ReparseMode { - /// Reparse a code block, including its braces. - Code, - /// Reparse a content block, including its square brackets. - Content, - /// Reparse elements of the markup. Also specified the initial `at_start` - /// state for the reparse and the minimum indent of the reparsed nodes. - MarkupElements { at_start: bool, min_indent: usize }, -} - -/// Whether changes _inside_ this node are safely encapsulated, so that only -/// this node must be reparsed. -fn is_bounded(kind: &NodeKind) -> bool { - match kind { - NodeKind::CodeBlock - | NodeKind::ContentBlock - | NodeKind::Linebreak - | NodeKind::SmartQuote { .. } - | NodeKind::BlockComment - | NodeKind::Space { .. } - | NodeKind::Escape(_) - | NodeKind::Shorthand(_) => true, - _ => false, - } -} - -/// Whether `at_start` would still be true after this node given the -/// previous value of the property. -fn next_at_start(kind: &NodeKind, prev: bool) -> bool { - match kind { - NodeKind::Space { newlines: (1 ..) } => true, - NodeKind::Space { .. } | NodeKind::LineComment | NodeKind::BlockComment => prev, - _ => false, - } -} - -#[cfg(test)] -#[rustfmt::skip] -mod tests { - use super::*; - use crate::parse::parse; - use crate::parse::tests::check; - use crate::source::Source; - - #[track_caller] - fn test(prev: &str, range: Range<usize>, with: &str, goal: Range<usize>) { - let mut source = Source::detached(prev); - let range = source.edit(range, with); - check(source.text(), source.root(), &parse(source.text())); - assert_eq!(range, goal); - } - - #[test] - fn test_parse_incremental_simple_replacements() { - test("hello world", 7 .. 12, "walkers", 0 .. 14); - test("some content", 0..12, "", 0..0); - test("", 0..0, "do it", 0..5); - test("a d e", 1 .. 3, " b c d", 0 .. 9); - test("*~ *", 2..2, "*", 0..5); - test("_1_\n2a\n3", 5..5, "4", 4..7); - test("_1_\n2a\n3~", 8..8, "4", 4..10); - test("_1_ 2 3a\n4", 7..7, "5", 0..9); - test("* {1+2} *", 5..6, "3", 2..7); - test("a #f() e", 1 .. 6, " b c d", 0 .. 9); - test("a\nb\nc\nd\ne\n", 5 .. 5, "c", 2 .. 7); - test("a\n\nb\n\nc\n\nd\n\ne\n", 7 .. 7, "c", 3 .. 10); - test("a\nb\nc *hel a b lo* d\nd\ne", 13..13, "c ", 4..20); - test("~~ {a} ~~", 4 .. 5, "b", 3 .. 6); - test("{(0, 1, 2)}", 5 .. 6, "11pt", 0..14); - test("\n= A heading", 4 .. 4, "n evocative", 0 .. 23); - test("for~your~thing", 9 .. 9, "a", 0 .. 15); - test("a your thing a", 6 .. 7, "a", 0 .. 14); - test("{call(); abc}", 7 .. 7, "[]", 0 .. 15); - test("#call() abc", 7 .. 7, "[]", 0 .. 10); - test("hi[\n- item\n- item 2\n - item 3]", 11 .. 11, " ", 2 .. 35); - test("hi\n- item\nno item\n - item 3", 10 .. 10, "- ", 3..19); - test("#grid(columns: (auto, 1fr, 40%), [*plonk*], rect(width: 100%, height: 1pt, fill: conifer), [thing])", 16 .. 20, "none", 0..99); - test("#grid(columns: (auto, 1fr, 40%), [*plonk*], rect(width: 100%, height: 1pt, fill: conifer), [thing])", 33 .. 42, "[_gronk_]", 33..42); - test("#grid(columns: (auto, 1fr, 40%), [*plonk*], rect(width: 100%, height: 1pt, fill: conifer), [thing])", 34 .. 41, "_bar_", 33 .. 40); - test("{let i=1; for x in range(5) {i}}", 6 .. 6, " ", 0 .. 33); - test("{let i=1; for x in range(5) {i}}", 13 .. 14, " ", 0 .. 33); - test("hello~~{x}", 7 .. 10, "#f()", 0 .. 11); - test("this~is -- in my opinion -- spectacular", 8 .. 10, "---", 0 .. 25); - test("understanding `code` is complicated", 15 .. 15, "C ", 0 .. 22); - test("{ let x = g() }", 10 .. 12, "f(54", 0 .. 17); - test(r#"a ```typst hello``` b"#, 16 .. 17, "", 0 .. 18); - test(r#"a ```typst hello```"#, 16 .. 17, "", 0 .. 18); - test("#for", 4 .. 4, "//", 0 .. 6); - test("#show a: f as b..", 16..16, "c", 0..18); - test("a\n#let \nb", 7 .. 7, "i", 2 .. 9); - test("a\n#for i \nb", 9 .. 9, "in", 2 .. 12); - test("a~https://fun/html", 13..14, "n", 0..18); - } - - #[test] - fn test_parse_incremental_whitespace_invariants() { - test("hello \\ world", 7 .. 8, "a ", 0 .. 14); - test("hello \\ world", 7 .. 8, " a", 0 .. 14); - test("x = y", 1 .. 1, " + y", 0 .. 6); - test("x = y", 1 .. 1, " + y\n", 0 .. 7); - test("abc\n= a heading\njoke", 3 .. 4, "\nmore\n\n", 0 .. 21); - test("abc\n= a heading\njoke", 3 .. 4, "\nnot ", 0 .. 19); - test("#let x = (1, 2 + ;~ Five\r\n\r", 20 .. 23, "2.", 0 .. 23); - test("hey #myfriend", 4 .. 4, "\\", 0 .. 14); - test("hey #myfriend", 4 .. 4, "\\", 0 .. 6); - test("= foo\nbar\n - a\n - b", 6 .. 9, "", 0 .. 11); - test("= foo\n bar\n baz", 6 .. 8, "", 0 .. 9); - test(" // hi", 1 .. 1, " ", 0 .. 7); - test("- \nA", 2..3, "", 0..3); - } - - #[test] - fn test_parse_incremental_type_invariants() { - test("a #for x in array {x}", 18 .. 21, "[#x]", 0 .. 22); - test("a #let x = 1 {5}", 3 .. 6, "if", 0 .. 11); - test("a {let x = 1 {5}} b", 3 .. 6, "if", 2 .. 16); - test("#let x = 1 {5}", 4 .. 4, " if", 0 .. 13); - test("{let x = 1 {5}}", 4 .. 4, " if", 0 .. 18); - test("a // b c #f()", 3 .. 4, "", 0 .. 12); - test("{\nf()\n//g(a)\n}", 6 .. 8, "", 0 .. 12); - test("a{\nf()\n//g(a)\n}b", 7 .. 9, "", 1 .. 13); - test("a #while x {\n g(x) \n} b", 11 .. 11, "//", 0 .. 26); - test("{(1, 2)}", 1 .. 1, "while ", 0 .. 14); - test("a b c", 1 .. 1, "{[}", 0 .. 8); - } - - #[test] - fn test_parse_incremental_wrongly_or_unclosed_things() { - test(r#"{"hi"}"#, 4 .. 5, "c", 0 .. 6); - test(r"this \u{abcd}", 8 .. 9, "", 0 .. 12); - test(r"this \u{abcd} that", 12 .. 13, "", 0 .. 17); - test(r"{{let x = z}; a = 1} b", 6 .. 6, "//", 0 .. 24); - test("a b c", 1 .. 1, " /* letters */", 0 .. 19); - test("a b c", 1 .. 1, " /* letters", 0 .. 16); - test("{if i==1 {a} else [b]; b()}", 12 .. 12, " /* letters */", 0 .. 41); - test("{if i==1 {a} else [b]; b()}", 12 .. 12, " /* letters", 0 .. 38); - test("~~~~", 2 .. 2, "[]", 0 .. 5); - test("a[]b", 2 .. 2, "{", 1 .. 4); - test("[hello]", 2 .. 3, "]", 0 .. 7); - test("{a}", 1 .. 2, "b", 0 .. 3); - test("{ a; b; c }", 5 .. 6, "[}]", 0 .. 13); - test("#a()\n~", 3..4, "{}", 0..7); - test("[]\n~", 1..2, "#if i==0 {true}", 0..18); - } -} diff --git a/src/parse/mod.rs b/src/parse/mod.rs deleted file mode 100644 index ac8ec6eb..00000000 --- a/src/parse/mod.rs +++ /dev/null @@ -1,1166 +0,0 @@ -//! Parsing and tokenization. - -mod incremental; -mod parser; -mod resolve; -mod tokens; - -pub use incremental::*; -pub use parser::*; -pub use tokens::*; - -use std::collections::HashSet; - -use crate::syntax::ast::{Assoc, BinOp, UnOp}; -use crate::syntax::{ErrorPos, NodeKind, SyntaxNode}; -use crate::util::EcoString; - -/// Parse a source file. -pub fn parse(text: &str) -> SyntaxNode { - let mut p = Parser::new(text, TokenMode::Markup); - markup(&mut p, true); - p.finish().into_iter().next().unwrap() -} - -/// Parse code directly, only used for syntax highlighting. -pub fn parse_code(text: &str) -> SyntaxNode { - let mut p = Parser::new(text, TokenMode::Code); - p.perform(NodeKind::CodeBlock, code); - p.finish().into_iter().next().unwrap() -} - -/// Reparse a code block. -/// -/// Returns `Some` if all of the input was consumed. -fn reparse_code_block( - prefix: &str, - text: &str, - end_pos: usize, -) -> Option<(Vec<SyntaxNode>, bool, usize)> { - let mut p = Parser::with_prefix(prefix, text, TokenMode::Code); - if !p.at(NodeKind::LeftBrace) { - return None; - } - - code_block(&mut p); - - let (mut node, terminated) = p.consume()?; - let first = node.remove(0); - if first.len() != end_pos { - return None; - } - - Some((vec![first], terminated, 1)) -} - -/// Reparse a content block. -/// -/// Returns `Some` if all of the input was consumed. -fn reparse_content_block( - prefix: &str, - text: &str, - end_pos: usize, -) -> Option<(Vec<SyntaxNode>, bool, usize)> { - let mut p = Parser::with_prefix(prefix, text, TokenMode::Code); - if !p.at(NodeKind::LeftBracket) { - return None; - } - - content_block(&mut p); - - let (mut node, terminated) = p.consume()?; - let first = node.remove(0); - if first.len() != end_pos { - return None; - } - - Some((vec![first], terminated, 1)) -} - -/// Reparse a sequence markup elements without the topmost node. -/// -/// Returns `Some` if all of the input was consumed. -fn reparse_markup_elements( - prefix: &str, - text: &str, - end_pos: usize, - differential: isize, - reference: &[SyntaxNode], - mut at_start: bool, - min_indent: usize, -) -> Option<(Vec<SyntaxNode>, bool, usize)> { - let mut p = Parser::with_prefix(prefix, text, TokenMode::Markup); - - let mut node: Option<&SyntaxNode> = None; - let mut iter = reference.iter(); - let mut offset = differential; - let mut replaced = 0; - let mut stopped = false; - - 'outer: while !p.eof() { - if let Some(NodeKind::Space { newlines: (1 ..) }) = p.peek() { - if p.column(p.current_end()) < min_indent { - return None; - } - } - - markup_node(&mut p, &mut at_start); - - if p.prev_end() <= end_pos { - continue; - } - - let recent = p.marker().before(&p).unwrap(); - let recent_start = p.prev_end() - recent.len(); - - while offset <= recent_start as isize { - if let Some(node) = node { - // The nodes are equal, at the same position and have the - // same content. The parsing trees have converged again, so - // the reparse may stop here. - if offset == recent_start as isize && node == recent { - replaced -= 1; - stopped = true; - break 'outer; - } - } - - if let Some(node) = node { - offset += node.len() as isize; - } - - node = iter.next(); - if node.is_none() { - break; - } - - replaced += 1; - } - } - - if p.eof() && !stopped { - replaced = reference.len(); - } - - let (mut res, terminated) = p.consume()?; - if stopped { - res.pop().unwrap(); - } - - Some((res, terminated, replaced)) -} - -/// Parse markup. -/// -/// If `at_start` is true, things like headings that may only appear at the -/// beginning of a line or content block are initially allowed. -fn markup(p: &mut Parser, mut at_start: bool) { - p.perform(NodeKind::Markup { min_indent: 0 }, |p| { - while !p.eof() { - markup_node(p, &mut at_start); - } - }); -} - -/// Parse markup that stays right of the given `column`. -fn markup_indented(p: &mut Parser, min_indent: usize) { - p.eat_while(|t| match t { - NodeKind::Space { newlines } => *newlines == 0, - NodeKind::LineComment | NodeKind::BlockComment => true, - _ => false, - }); - - let marker = p.marker(); - let mut at_start = false; - - while !p.eof() { - match p.peek() { - Some(NodeKind::Space { newlines: (1 ..) }) - if p.column(p.current_end()) < min_indent => - { - break; - } - _ => {} - } - - markup_node(p, &mut at_start); - } - - marker.end(p, NodeKind::Markup { min_indent }); -} - -/// Parse a line of markup that can prematurely end if `f` returns true. -fn markup_line<F>(p: &mut Parser, mut f: F) -where - F: FnMut(&NodeKind) -> bool, -{ - p.eat_while(|t| match t { - NodeKind::Space { newlines } => *newlines == 0, - NodeKind::LineComment | NodeKind::BlockComment => true, - _ => false, - }); - - p.perform(NodeKind::Markup { min_indent: usize::MAX }, |p| { - let mut at_start = false; - while let Some(kind) = p.peek() { - if let NodeKind::Space { newlines: (1 ..) } = kind { - break; - } - - if f(kind) { - break; - } - - markup_node(p, &mut at_start); - } - }); -} - -/// Parse a markup node. -fn markup_node(p: &mut Parser, at_start: &mut bool) { - let token = match p.peek() { - Some(t) => t, - None => return, - }; - - match token { - // Whitespace. - NodeKind::Space { newlines } => { - *at_start |= *newlines > 0; - p.eat(); - return; - } - - // Comments. - NodeKind::LineComment | NodeKind::BlockComment => { - p.eat(); - return; - } - - // Text and markup. - NodeKind::Text(_) - | NodeKind::Linebreak - | NodeKind::SmartQuote { .. } - | NodeKind::Escape(_) - | NodeKind::Shorthand(_) - | NodeKind::Link(_) - | NodeKind::Raw(_) - | NodeKind::Label(_) - | NodeKind::Ref(_) => p.eat(), - - // Math. - NodeKind::Dollar => math(p), - - // Strong, emph, heading. - NodeKind::Star => strong(p), - NodeKind::Underscore => emph(p), - NodeKind::Eq => heading(p, *at_start), - - // Lists. - NodeKind::Minus => list_node(p, *at_start), - NodeKind::Plus | NodeKind::EnumNumbering(_) => enum_node(p, *at_start), - NodeKind::Slash => { - desc_node(p, *at_start).ok(); - } - NodeKind::Colon => { - let marker = p.marker(); - p.eat(); - marker.convert(p, NodeKind::Text(':'.into())); - } - - // Hashtag + keyword / identifier. - NodeKind::Ident(_) - | NodeKind::Let - | NodeKind::Set - | NodeKind::Show - | NodeKind::Wrap - | NodeKind::If - | NodeKind::While - | NodeKind::For - | NodeKind::Import - | NodeKind::Include - | NodeKind::Break - | NodeKind::Continue - | NodeKind::Return => markup_expr(p), - - // Code and content block. - NodeKind::LeftBrace => code_block(p), - NodeKind::LeftBracket => content_block(p), - - NodeKind::Error(_, _) => p.eat(), - _ => p.unexpected(), - }; - - *at_start = false; -} - -/// Parse strong content. -fn strong(p: &mut Parser) { - p.perform(NodeKind::Strong, |p| { - p.start_group(Group::Strong); - markup(p, false); - p.end_group(); - }) -} - -/// Parse emphasized content. -fn emph(p: &mut Parser) { - p.perform(NodeKind::Emph, |p| { - p.start_group(Group::Emph); - markup(p, false); - p.end_group(); - }) -} - -/// Parse a heading. -fn heading(p: &mut Parser, at_start: bool) { - let marker = p.marker(); - let current_start = p.current_start(); - p.assert(NodeKind::Eq); - while p.eat_if(NodeKind::Eq) {} - - if at_start && p.peek().map_or(true, |kind| kind.is_space()) { - p.eat_while(|kind| *kind == NodeKind::Space { newlines: 0 }); - markup_line(p, |kind| matches!(kind, NodeKind::Label(_))); - marker.end(p, NodeKind::Heading); - } else { - let text = p.get(current_start .. p.prev_end()).into(); - marker.convert(p, NodeKind::Text(text)); - } -} - -/// Parse a single list item. -fn list_node(p: &mut Parser, at_start: bool) { - let marker = p.marker(); - let text: EcoString = p.peek_src().into(); - p.assert(NodeKind::Minus); - - let min_indent = p.column(p.prev_end()); - if at_start && p.eat_if(NodeKind::Space { newlines: 0 }) && !p.eof() { - markup_indented(p, min_indent); - marker.end(p, NodeKind::ListItem); - } else { - marker.convert(p, NodeKind::Text(text)); - } -} - -/// Parse a single enum item. -fn enum_node(p: &mut Parser, at_start: bool) { - let marker = p.marker(); - let text: EcoString = p.peek_src().into(); - p.eat(); - - let min_indent = p.column(p.prev_end()); - if at_start && p.eat_if(NodeKind::Space { newlines: 0 }) && !p.eof() { - markup_indented(p, min_indent); - marker.end(p, NodeKind::EnumItem); - } else { - marker.convert(p, NodeKind::Text(text)); - } -} - -/// Parse a single description list item. -fn desc_node(p: &mut Parser, at_start: bool) -> ParseResult { - let marker = p.marker(); - let text: EcoString = p.peek_src().into(); - p.eat(); - - let min_indent = p.column(p.prev_end()); - if at_start && p.eat_if(NodeKind::Space { newlines: 0 }) && !p.eof() { - markup_line(p, |node| matches!(node, NodeKind::Colon)); - p.expect(NodeKind::Colon)?; - markup_indented(p, min_indent); - marker.end(p, NodeKind::DescItem); - } else { - marker.convert(p, NodeKind::Text(text)); - } - - Ok(()) -} - -/// Parse an expression within a markup mode. -fn markup_expr(p: &mut Parser) { - // Does the expression need termination or can content follow directly? - let stmt = matches!( - p.peek(), - Some( - NodeKind::Let - | NodeKind::Set - | NodeKind::Show - | NodeKind::Wrap - | NodeKind::Import - | NodeKind::Include - ) - ); - - p.start_group(Group::Expr); - let res = expr_prec(p, true, 0); - if stmt && res.is_ok() && !p.eof() { - p.expected("semicolon or line break"); - } - p.end_group(); -} - -/// Parse math. -fn math(p: &mut Parser) { - p.perform(NodeKind::Math, |p| { - p.start_group(Group::Math); - while !p.eof() { - math_node(p); - } - p.end_group(); - }); -} - -/// Parse a math node. -fn math_node(p: &mut Parser) { - math_node_prec(p, 0, None) -} - -/// Parse a math node with operators having at least the minimum precedence. -fn math_node_prec(p: &mut Parser, min_prec: usize, stop: Option<NodeKind>) { - let marker = p.marker(); - math_primary(p); - - loop { - let (kind, mut prec, assoc, stop) = match p.peek() { - v if v == stop.as_ref() => break, - Some(NodeKind::Underscore) => { - (NodeKind::Script, 2, Assoc::Right, Some(NodeKind::Hat)) - } - Some(NodeKind::Hat) => ( - NodeKind::Script, - 2, - Assoc::Right, - Some(NodeKind::Underscore), - ), - Some(NodeKind::Slash) => (NodeKind::Frac, 1, Assoc::Left, None), - _ => break, - }; - - if prec < min_prec { - break; - } - - match assoc { - Assoc::Left => prec += 1, - Assoc::Right => {} - } - - p.eat(); - math_node_prec(p, prec, stop); - - // Allow up to two different scripts. We do not risk encountering the - // previous script kind again here due to right-associativity. - if p.eat_if(NodeKind::Underscore) || p.eat_if(NodeKind::Hat) { - math_node_prec(p, prec, None); - } - - marker.end(p, kind); - } -} - -/// Parse a primary math node. -fn math_primary(p: &mut Parser) { - let token = match p.peek() { - Some(t) => t, - None => return, - }; - - match token { - // Spaces, atoms and expressions. - NodeKind::Space { .. } - | NodeKind::Linebreak - | NodeKind::Escape(_) - | NodeKind::Atom(_) - | NodeKind::Ident(_) => p.eat(), - - // Groups. - NodeKind::LeftParen => group(p, Group::Paren, '(', ')'), - NodeKind::LeftBracket => group(p, Group::Bracket, '[', ']'), - NodeKind::LeftBrace => group(p, Group::Brace, '{', '}'), - - // Alignment indactor. - NodeKind::Amp => align(p), - - _ => p.unexpected(), - } -} - -/// Parse grouped math. -fn group(p: &mut Parser, group: Group, l: char, r: char) { - p.perform(NodeKind::Math, |p| { - let marker = p.marker(); - p.start_group(group); - marker.convert(p, NodeKind::Atom(l.into())); - while !p.eof() { - math_node(p); - } - let marker = p.marker(); - p.end_group(); - marker.convert(p, NodeKind::Atom(r.into())); - }) -} - -/// Parse an alignment indicator. -fn align(p: &mut Parser) { - p.perform(NodeKind::Align, |p| { - p.assert(NodeKind::Amp); - while p.eat_if(NodeKind::Amp) {} - }) -} - -/// Parse an expression. -fn expr(p: &mut Parser) -> ParseResult { - expr_prec(p, false, 0) -} - -/// Parse an expression with operators having at least the minimum precedence. -/// -/// If `atomic` is true, this does not parse binary operations and arrow -/// functions, which is exactly what we want in a shorthand expression directly -/// in markup. -/// -/// Stops parsing at operations with lower precedence than `min_prec`, -fn expr_prec(p: &mut Parser, atomic: bool, min_prec: usize) -> ParseResult { - let marker = p.marker(); - - // Start the unary expression. - match p.peek().and_then(UnOp::from_token) { - Some(op) if !atomic => { - p.eat(); - let prec = op.precedence(); - expr_prec(p, atomic, prec)?; - marker.end(p, NodeKind::Unary); - } - _ => primary(p, atomic)?, - }; - - loop { - // Parenthesis or bracket means this is a function call. - if let Some(NodeKind::LeftParen | NodeKind::LeftBracket) = p.peek_direct() { - marker.perform(p, NodeKind::FuncCall, args)?; - continue; - } - - if atomic { - break; - } - - // Method call or field access. - if p.eat_if(NodeKind::Dot) { - ident(p)?; - if let Some(NodeKind::LeftParen | NodeKind::LeftBracket) = p.peek_direct() { - marker.perform(p, NodeKind::MethodCall, args)?; - } else { - marker.end(p, NodeKind::FieldAccess); - } - continue; - } - - let op = if p.eat_if(NodeKind::Not) { - if p.at(NodeKind::In) { - BinOp::NotIn - } else { - p.expected("keyword `in`"); - return Err(ParseError); - } - } else { - match p.peek().and_then(BinOp::from_token) { - Some(binop) => binop, - None => break, - } - }; - - let mut prec = op.precedence(); - if prec < min_prec { - break; - } - - p.eat(); - - match op.assoc() { - Assoc::Left => prec += 1, - Assoc::Right => {} - } - - marker.perform(p, NodeKind::Binary, |p| expr_prec(p, atomic, prec))?; - } - - Ok(()) -} - -/// Parse a primary expression. -fn primary(p: &mut Parser, atomic: bool) -> ParseResult { - if literal(p) { - return Ok(()); - } - - match p.peek() { - // Things that start with an identifier. - Some(NodeKind::Ident(_)) => { - let marker = p.marker(); - p.eat(); - - // Arrow means this is a closure's lone parameter. - if !atomic && p.at(NodeKind::Arrow) { - marker.end(p, NodeKind::Params); - p.assert(NodeKind::Arrow); - marker.perform(p, NodeKind::Closure, expr) - } else { - Ok(()) - } - } - - // Structures. - Some(NodeKind::LeftParen) => parenthesized(p, atomic), - Some(NodeKind::LeftBrace) => Ok(code_block(p)), - Some(NodeKind::LeftBracket) => Ok(content_block(p)), - - // Keywords. - Some(NodeKind::Let) => let_expr(p), - Some(NodeKind::Set) => set_expr(p), - Some(NodeKind::Show) => show_expr(p), - Some(NodeKind::Wrap) => wrap_expr(p), - Some(NodeKind::If) => if_expr(p), - Some(NodeKind::While) => while_expr(p), - Some(NodeKind::For) => for_expr(p), - Some(NodeKind::Import) => import_expr(p), - Some(NodeKind::Include) => include_expr(p), - Some(NodeKind::Break) => break_expr(p), - Some(NodeKind::Continue) => continue_expr(p), - Some(NodeKind::Return) => return_expr(p), - - Some(NodeKind::Error(_, _)) => { - p.eat(); - Err(ParseError) - } - - // Nothing. - _ => { - p.expected_found("expression"); - Err(ParseError) - } - } -} - -/// Parse a literal. -fn literal(p: &mut Parser) -> bool { - match p.peek() { - // Basic values. - Some( - NodeKind::None - | NodeKind::Auto - | NodeKind::Int(_) - | NodeKind::Float(_) - | NodeKind::Bool(_) - | NodeKind::Numeric(_, _) - | NodeKind::Str(_), - ) => { - p.eat(); - true - } - - _ => false, - } -} - -/// Parse an identifier. -fn ident(p: &mut Parser) -> ParseResult { - match p.peek() { - Some(NodeKind::Ident(_)) => { - p.eat(); - Ok(()) - } - _ => { - p.expected_found("identifier"); - Err(ParseError) - } - } -} - -/// Parse something that starts with a parenthesis, which can be either of: -/// - Array literal -/// - Dictionary literal -/// - Parenthesized expression -/// - Parameter list of closure expression -fn parenthesized(p: &mut Parser, atomic: bool) -> ParseResult { - let marker = p.marker(); - - p.start_group(Group::Paren); - let colon = p.eat_if(NodeKind::Colon); - let kind = collection(p, true).0; - p.end_group(); - - // Leading colon makes this a dictionary. - if colon { - dict(p, marker); - return Ok(()); - } - - // Arrow means this is a closure's parameter list. - if !atomic && p.at(NodeKind::Arrow) { - params(p, marker); - p.assert(NodeKind::Arrow); - return marker.perform(p, NodeKind::Closure, expr); - } - - // Transform into the identified collection. - match kind { - CollectionKind::Group => marker.end(p, NodeKind::Parenthesized), - CollectionKind::Positional => array(p, marker), - CollectionKind::Named => dict(p, marker), - } - - Ok(()) -} - -/// The type of a collection. -#[derive(Debug, Copy, Clone, Eq, PartialEq)] -enum CollectionKind { - /// The collection is only one item and has no comma. - Group, - /// The collection starts with a positional item and has multiple items or a - /// trailing comma. - Positional, - /// The collection starts with a colon or named item. - Named, -} - -/// Parse a collection. -/// -/// Returns the length of the collection and whether the literal contained any -/// commas. -fn collection(p: &mut Parser, keyed: bool) -> (CollectionKind, usize) { - let mut kind = None; - let mut items = 0; - let mut can_group = true; - let mut missing_coma: Option<Marker> = None; - - while !p.eof() { - if let Ok(item_kind) = item(p, keyed) { - match item_kind { - NodeKind::Spread => can_group = false, - NodeKind::Named if kind.is_none() => { - kind = Some(CollectionKind::Named); - can_group = false; - } - _ if kind.is_none() => { - kind = Some(CollectionKind::Positional); - } - _ => {} - } - - items += 1; - - if let Some(marker) = missing_coma.take() { - p.expected_at(marker, "comma"); - } - - if p.eof() { - break; - } - - if p.eat_if(NodeKind::Comma) { - can_group = false; - } else { - missing_coma = Some(p.trivia_start()); - } - } else { - p.eat_if(NodeKind::Comma); - kind = Some(CollectionKind::Group); - } - } - - let kind = if can_group && items == 1 { - CollectionKind::Group - } else { - kind.unwrap_or(CollectionKind::Positional) - }; - - (kind, items) -} - -/// Parse an expression or a named pair, returning whether it's a spread or a -/// named pair. -fn item(p: &mut Parser, keyed: bool) -> ParseResult<NodeKind> { - let marker = p.marker(); - if p.eat_if(NodeKind::Dots) { - marker.perform(p, NodeKind::Spread, expr)?; - return Ok(NodeKind::Spread); - } - - expr(p)?; - - if p.at(NodeKind::Colon) { - match marker.after(p).map(|c| c.kind()) { - Some(NodeKind::Ident(_)) => { - p.eat(); - marker.perform(p, NodeKind::Named, expr)?; - } - Some(NodeKind::Str(_)) if keyed => { - p.eat(); - marker.perform(p, NodeKind::Keyed, expr)?; - } - kind => { - let mut msg = EcoString::from("expected identifier"); - if keyed { - msg.push_str(" or string"); - } - if let Some(kind) = kind { - msg.push_str(", found "); - msg.push_str(kind.name()); - } - let error = NodeKind::Error(ErrorPos::Full, msg); - marker.end(p, error); - p.eat(); - marker.perform(p, NodeKind::Named, expr).ok(); - return Err(ParseError); - } - } - - Ok(NodeKind::Named) - } else { - Ok(NodeKind::None) - } -} - -/// Convert a collection into an array, producing errors for anything other than -/// expressions. -fn array(p: &mut Parser, marker: Marker) { - marker.filter_children(p, |x| match x.kind() { - NodeKind::Named | NodeKind::Keyed => Err("expected expression"), - _ => Ok(()), - }); - marker.end(p, NodeKind::Array); -} - -/// Convert a collection into a dictionary, producing errors for anything other -/// than named and keyed pairs. -fn dict(p: &mut Parser, marker: Marker) { - let mut used = HashSet::new(); - marker.filter_children(p, |x| match x.kind() { - kind if kind.is_paren() => Ok(()), - NodeKind::Named | NodeKind::Keyed => { - if let Some(NodeKind::Ident(key) | NodeKind::Str(key)) = - x.children().next().map(|child| child.kind()) - { - if !used.insert(key.clone()) { - return Err("pair has duplicate key"); - } - } - Ok(()) - } - NodeKind::Spread | NodeKind::Comma | NodeKind::Colon => Ok(()), - _ => Err("expected named or keyed pair"), - }); - marker.end(p, NodeKind::Dict); -} - -/// Convert a collection into a list of parameters, producing errors for -/// anything other than identifiers, spread operations and named pairs. -fn params(p: &mut Parser, marker: Marker) { - marker.filter_children(p, |x| match x.kind() { - kind if kind.is_paren() => Ok(()), - NodeKind::Named | NodeKind::Ident(_) | NodeKind::Comma => Ok(()), - NodeKind::Spread - if matches!( - x.children().last().map(|child| child.kind()), - Some(&NodeKind::Ident(_)) - ) => - { - Ok(()) - } - _ => Err("expected identifier, named pair or argument sink"), - }); - marker.end(p, NodeKind::Params); -} - -/// Parse a code block: `{...}`. -fn code_block(p: &mut Parser) { - p.perform(NodeKind::CodeBlock, |p| { - p.start_group(Group::Brace); - code(p); - p.end_group(); - }); -} - -/// Parse expressions. -fn code(p: &mut Parser) { - while !p.eof() { - p.start_group(Group::Expr); - if expr(p).is_ok() && !p.eof() { - p.expected("semicolon or line break"); - } - p.end_group(); - - // Forcefully skip over newlines since the group's contents can't. - p.eat_while(NodeKind::is_space); - } -} - -/// Parse a content block: `[...]`. -fn content_block(p: &mut Parser) { - p.perform(NodeKind::ContentBlock, |p| { - p.start_group(Group::Bracket); - markup(p, true); - p.end_group(); - }); -} - -/// Parse the arguments to a function call. -fn args(p: &mut Parser) -> ParseResult { - match p.peek_direct() { - Some(NodeKind::LeftParen) => {} - Some(NodeKind::LeftBracket) => {} - _ => { - p.expected_found("argument list"); - return Err(ParseError); - } - } - - p.perform(NodeKind::Args, |p| { - if p.at(NodeKind::LeftParen) { - let marker = p.marker(); - p.start_group(Group::Paren); - collection(p, false); - p.end_group(); - - let mut used = HashSet::new(); - marker.filter_children(p, |x| match x.kind() { - NodeKind::Named => { - if let Some(NodeKind::Ident(ident)) = - x.children().next().map(|child| child.kind()) - { - if !used.insert(ident.clone()) { - return Err("duplicate argument"); - } - } - Ok(()) - } - _ => Ok(()), - }); - } - - while p.peek_direct() == Some(&NodeKind::LeftBracket) { - content_block(p); - } - }); - - Ok(()) -} - -/// Parse a let expression. -fn let_expr(p: &mut Parser) -> ParseResult { - p.perform(NodeKind::LetBinding, |p| { - p.assert(NodeKind::Let); - - let marker = p.marker(); - ident(p)?; - - // If a parenthesis follows, this is a function definition. - let has_params = p.peek_direct() == Some(&NodeKind::LeftParen); - if has_params { - let marker = p.marker(); - p.start_group(Group::Paren); - collection(p, false); - p.end_group(); - params(p, marker); - } - - if p.eat_if(NodeKind::Eq) { - expr(p)?; - } else if has_params { - // Function definitions must have a body. - p.expected("body"); - } - - // Rewrite into a closure expression if it's a function definition. - if has_params { - marker.end(p, NodeKind::Closure); - } - - Ok(()) - }) -} - -/// Parse a set expression. -fn set_expr(p: &mut Parser) -> ParseResult { - p.perform(NodeKind::SetRule, |p| { - p.assert(NodeKind::Set); - ident(p)?; - args(p) - }) -} - -/// Parse a show expression. -fn show_expr(p: &mut Parser) -> ParseResult { - p.perform(NodeKind::ShowRule, |p| { - p.assert(NodeKind::Show); - let marker = p.marker(); - expr(p)?; - if p.eat_if(NodeKind::Colon) { - marker.filter_children(p, |child| match child.kind() { - NodeKind::Ident(_) | NodeKind::Colon => Ok(()), - _ => Err("expected identifier"), - }); - expr(p)?; - } - p.expect(NodeKind::As)?; - expr(p) - }) -} - -/// Parse a wrap expression. -fn wrap_expr(p: &mut Parser) -> ParseResult { - p.perform(NodeKind::WrapRule, |p| { - p.assert(NodeKind::Wrap); - ident(p)?; - p.expect(NodeKind::In)?; - expr(p) - }) -} - -/// Parse an if-else expresion. -fn if_expr(p: &mut Parser) -> ParseResult { - p.perform(NodeKind::Conditional, |p| { - p.assert(NodeKind::If); - - expr(p)?; - body(p)?; - - if p.eat_if(NodeKind::Else) { - if p.at(NodeKind::If) { - if_expr(p)?; - } else { - body(p)?; - } - } - - Ok(()) - }) -} - -/// Parse a while expresion. -fn while_expr(p: &mut Parser) -> ParseResult { - p.perform(NodeKind::WhileLoop, |p| { - p.assert(NodeKind::While); - expr(p)?; - body(p) - }) -} - -/// Parse a for-in expression. -fn for_expr(p: &mut Parser) -> ParseResult { - p.perform(NodeKind::ForLoop, |p| { - p.assert(NodeKind::For); - for_pattern(p)?; - p.expect(NodeKind::In)?; - expr(p)?; - body(p) - }) -} - -/// Parse a for loop pattern. -fn for_pattern(p: &mut Parser) -> ParseResult { - p.perform(NodeKind::ForPattern, |p| { - ident(p)?; - if p.eat_if(NodeKind::Comma) { - ident(p)?; - } - Ok(()) - }) -} - -/// Parse an import expression. -fn import_expr(p: &mut Parser) -> ParseResult { - p.perform(NodeKind::ModuleImport, |p| { - p.assert(NodeKind::Import); - - if !p.eat_if(NodeKind::Star) { - // This is the list of identifiers scenario. - p.perform(NodeKind::ImportItems, |p| { - p.start_group(Group::Imports); - let marker = p.marker(); - let items = collection(p, false).1; - if items == 0 { - p.expected("import items"); - } - p.end_group(); - - marker.filter_children(p, |n| match n.kind() { - NodeKind::Ident(_) | NodeKind::Comma => Ok(()), - _ => Err("expected identifier"), - }); - }); - }; - - p.expect(NodeKind::From)?; - expr(p) - }) -} - -/// Parse an include expression. -fn include_expr(p: &mut Parser) -> ParseResult { - p.perform(NodeKind::ModuleInclude, |p| { - p.assert(NodeKind::Include); - expr(p) - }) -} - -/// Parse a break expression. -fn break_expr(p: &mut Parser) -> ParseResult { - p.perform(NodeKind::BreakStmt, |p| { - p.assert(NodeKind::Break); - Ok(()) - }) -} - -/// Parse a continue expression. -fn continue_expr(p: &mut Parser) -> ParseResult { - p.perform(NodeKind::ContinueStmt, |p| { - p.assert(NodeKind::Continue); - Ok(()) - }) -} - -/// Parse a return expression. -fn return_expr(p: &mut Parser) -> ParseResult { - p.perform(NodeKind::ReturnStmt, |p| { - p.assert(NodeKind::Return); - if !p.at(NodeKind::Comma) && !p.eof() { - expr(p)?; - } - Ok(()) - }) -} - -/// Parse a control flow body. -fn body(p: &mut Parser) -> ParseResult { - match p.peek() { - Some(NodeKind::LeftBracket) => Ok(content_block(p)), - Some(NodeKind::LeftBrace) => Ok(code_block(p)), - _ => { - p.expected("body"); - Err(ParseError) - } - } -} - -#[cfg(test)] -mod tests { - use std::fmt::Debug; - - #[track_caller] - pub fn check<T>(text: &str, found: T, expected: T) - where - T: Debug + PartialEq, - { - if found != expected { - println!("source: {text:?}"); - println!("expected: {expected:#?}"); - println!("found: {found:#?}"); - panic!("test failed"); - } - } -} diff --git a/src/parse/parser.rs b/src/parse/parser.rs deleted file mode 100644 index 3dbb7d50..00000000 --- a/src/parse/parser.rs +++ /dev/null @@ -1,559 +0,0 @@ -use std::fmt::{self, Display, Formatter}; -use std::mem; -use std::ops::Range; - -use super::{TokenMode, Tokens}; -use crate::syntax::{ErrorPos, InnerNode, NodeData, NodeKind, SyntaxNode}; -use crate::util::EcoString; - -/// A convenient token-based parser. -pub struct Parser<'s> { - /// An iterator over the source tokens. - tokens: Tokens<'s>, - /// Whether we are at the end of the file or of a group. - eof: bool, - /// The current token. - current: Option<NodeKind>, - /// The end byte index of the last non-trivia token. - prev_end: usize, - /// The start byte index of the peeked token. - current_start: usize, - /// The stack of open groups. - groups: Vec<GroupEntry>, - /// The children of the currently built node. - children: Vec<SyntaxNode>, - /// Whether the last group was not correctly terminated. - unterminated_group: bool, - /// Whether a group terminator was found that did not close a group. - stray_terminator: bool, -} - -impl<'s> Parser<'s> { - /// Create a new parser for the source string. - pub fn new(text: &'s str, mode: TokenMode) -> Self { - Self::with_prefix("", text, mode) - } - - /// Create a new parser for the source string that is prefixed by some text - /// that does not need to be parsed but taken into account for column - /// calculation. - pub fn with_prefix(prefix: &str, text: &'s str, mode: TokenMode) -> Self { - let mut tokens = Tokens::with_prefix(prefix, text, mode); - let current = tokens.next(); - Self { - tokens, - eof: current.is_none(), - current, - prev_end: 0, - current_start: 0, - groups: vec![], - children: vec![], - unterminated_group: false, - stray_terminator: false, - } - } - - /// End the parsing process and return the parsed children. - pub fn finish(self) -> Vec<SyntaxNode> { - self.children - } - - /// End the parsing process and return - /// - the parsed children and whether the last token was terminated, if all - /// groups were terminated correctly, or - /// - `None` otherwise. - pub fn consume(self) -> Option<(Vec<SyntaxNode>, bool)> { - self.terminated().then(|| (self.children, self.tokens.terminated())) - } - - /// Create a new marker. - pub fn marker(&mut self) -> Marker { - Marker(self.children.len()) - } - - /// Create a marker right before the trailing trivia. - pub fn trivia_start(&self) -> Marker { - let count = self - .children - .iter() - .rev() - .take_while(|node| self.is_trivia(node.kind())) - .count(); - Marker(self.children.len() - count) - } - - /// Perform a subparse that wraps its result in a node with the given kind. - pub fn perform<F, T>(&mut self, kind: NodeKind, f: F) -> T - where - F: FnOnce(&mut Self) -> T, - { - let prev = mem::take(&mut self.children); - let output = f(self); - let until = self.trivia_start(); - let mut children = mem::replace(&mut self.children, prev); - - if self.tokens.mode() == TokenMode::Markup { - self.children.push(InnerNode::with_children(kind, children).into()); - } else { - // Trailing trivia should not be wrapped into the new node. - let idx = self.children.len(); - self.children.push(SyntaxNode::default()); - self.children.extend(children.drain(until.0 ..)); - self.children[idx] = InnerNode::with_children(kind, children).into(); - } - - output - } - - /// Whether the end of the source string or group is reached. - pub fn eof(&self) -> bool { - self.eof - } - - /// Consume the current token and also trailing trivia. - pub fn eat(&mut self) { - self.stray_terminator |= match self.current { - Some(NodeKind::RightParen) => !self.inside(Group::Paren), - Some(NodeKind::RightBracket) => !self.inside(Group::Bracket), - Some(NodeKind::RightBrace) => !self.inside(Group::Brace), - _ => false, - }; - - self.prev_end = self.tokens.cursor(); - self.bump(); - - if self.tokens.mode() != TokenMode::Markup { - // Skip whitespace and comments. - while self.current.as_ref().map_or(false, |x| self.is_trivia(x)) { - self.bump(); - } - } - - self.repeek(); - } - - /// Consume the current token if it is the given one. - pub fn eat_if(&mut self, kind: NodeKind) -> bool { - let at = self.at(kind); - if at { - self.eat(); - } - at - } - - /// Eat tokens while the condition is true. - pub fn eat_while<F>(&mut self, mut f: F) - where - F: FnMut(&NodeKind) -> bool, - { - while self.peek().map_or(false, |t| f(t)) { - self.eat(); - } - } - - /// Consume the current token if it is the given one and produce an error if - /// not. - pub fn expect(&mut self, kind: NodeKind) -> ParseResult { - let at = self.peek() == Some(&kind); - if at { - self.eat(); - Ok(()) - } else { - self.expected(kind.name()); - Err(ParseError) - } - } - - /// Consume the current token, debug-asserting that it is the given one. - #[track_caller] - pub fn assert(&mut self, kind: NodeKind) { - debug_assert_eq!(self.peek(), Some(&kind)); - self.eat(); - } - - /// Whether the current token is of the given type. - pub fn at(&self, kind: NodeKind) -> bool { - self.peek() == Some(&kind) - } - - /// Peek at the current token without consuming it. - pub fn peek(&self) -> Option<&NodeKind> { - if self.eof { None } else { self.current.as_ref() } - } - - /// Peek at the current token, but only if it follows immediately after the - /// last one without any trivia in between. - pub fn peek_direct(&self) -> Option<&NodeKind> { - if self.prev_end() == self.current_start() { - self.peek() - } else { - None - } - } - - /// Peek at the source of the current token. - pub fn peek_src(&self) -> &'s str { - self.get(self.current_start() .. self.current_end()) - } - - /// Obtain a range of the source code. - pub fn get(&self, range: Range<usize>) -> &'s str { - self.tokens.scanner().get(range) - } - - /// The byte index at which the last non-trivia token ended. - pub fn prev_end(&self) -> usize { - self.prev_end - } - - /// The byte index at which the current token starts. - pub fn current_start(&self) -> usize { - self.current_start - } - - /// The byte index at which the current token ends. - pub fn current_end(&self) -> usize { - self.tokens.cursor() - } - - /// Determine the column index for the given byte index. - pub fn column(&self, index: usize) -> usize { - self.tokens.column(index) - } - - /// Continue parsing in a group. - /// - /// When the end delimiter of the group is reached, all subsequent calls to - /// `peek()` return `None`. Parsing can only continue with a matching call - /// to `end_group`. - /// - /// This panics if the current token does not start the given group. - #[track_caller] - pub fn start_group(&mut self, kind: Group) { - self.groups.push(GroupEntry { kind, prev_mode: self.tokens.mode() }); - self.tokens.set_mode(match kind { - Group::Strong | Group::Emph => TokenMode::Markup, - Group::Bracket => match self.tokens.mode() { - TokenMode::Math => TokenMode::Math, - _ => TokenMode::Markup, - }, - Group::Brace | Group::Paren => match self.tokens.mode() { - TokenMode::Math => TokenMode::Math, - _ => TokenMode::Code, - }, - Group::Math => TokenMode::Math, - Group::Expr | Group::Imports => TokenMode::Code, - }); - - match kind { - Group::Brace => self.assert(NodeKind::LeftBrace), - Group::Bracket => self.assert(NodeKind::LeftBracket), - Group::Paren => self.assert(NodeKind::LeftParen), - Group::Strong => self.assert(NodeKind::Star), - Group::Emph => self.assert(NodeKind::Underscore), - Group::Math => self.assert(NodeKind::Dollar), - Group::Expr => self.repeek(), - Group::Imports => self.repeek(), - } - } - - /// End the parsing of a group. - /// - /// This panics if no group was started. - #[track_caller] - pub fn end_group(&mut self) { - let group_mode = self.tokens.mode(); - let group = self.groups.pop().expect("no started group"); - self.tokens.set_mode(group.prev_mode); - - let mut rescan = self.tokens.mode() != group_mode; - - // Eat the end delimiter if there is one. - if let Some((end, required)) = match group.kind { - Group::Brace => Some((NodeKind::RightBrace, true)), - Group::Bracket => Some((NodeKind::RightBracket, true)), - Group::Paren => Some((NodeKind::RightParen, true)), - Group::Strong => Some((NodeKind::Star, true)), - Group::Emph => Some((NodeKind::Underscore, true)), - Group::Math => Some((NodeKind::Dollar, true)), - Group::Expr => Some((NodeKind::Semicolon, false)), - Group::Imports => None, - } { - if self.current.as_ref() == Some(&end) { - // If another group closes after a group with the missing - // terminator, its scope of influence ends here and no longer - // taints the rest of the reparse. - self.unterminated_group = false; - - // Bump the delimeter and return. No need to rescan in this - // case. Also, we know that the delimiter is not stray even - // though we already removed the group. - let s = self.stray_terminator; - self.eat(); - self.stray_terminator = s; - rescan = false; - } else if required { - self.expected(end.name()); - self.unterminated_group = true; - } - } - - // Rescan the peeked token if the mode changed. - if rescan { - let mut target = self.prev_end(); - if group_mode != TokenMode::Markup { - let start = self.trivia_start().0; - target = self.current_start - - self.children[start ..].iter().map(SyntaxNode::len).sum::<usize>(); - self.children.truncate(start); - } - - self.tokens.jump(target); - self.prev_end = self.tokens.cursor(); - self.current_start = self.tokens.cursor(); - self.current = self.tokens.next(); - } - - self.repeek(); - } - - /// Checks if all groups were correctly terminated. - fn terminated(&self) -> bool { - self.groups.is_empty() && !self.unterminated_group && !self.stray_terminator - } - - /// Low-level bump that consumes exactly one token without special trivia - /// handling. - fn bump(&mut self) { - let kind = self.current.take().unwrap(); - let len = self.tokens.cursor() - self.current_start; - self.children.push(NodeData::new(kind, len).into()); - self.current_start = self.tokens.cursor(); - self.current = self.tokens.next(); - } - - /// Take another look at the current token to recheck whether it ends a - /// group. - fn repeek(&mut self) { - self.eof = match &self.current { - Some(NodeKind::RightBrace) => self.inside(Group::Brace), - Some(NodeKind::RightBracket) => self.inside(Group::Bracket), - Some(NodeKind::RightParen) => self.inside(Group::Paren), - Some(NodeKind::Star) => self.inside(Group::Strong), - Some(NodeKind::Underscore) => self.inside(Group::Emph), - Some(NodeKind::Dollar) => self.inside(Group::Math), - Some(NodeKind::Semicolon) => self.inside(Group::Expr), - Some(NodeKind::From) => self.inside(Group::Imports), - Some(NodeKind::Space { newlines }) => self.space_ends_group(*newlines), - Some(_) => false, - None => true, - }; - } - - /// Returns whether the given type can be skipped over. - fn is_trivia(&self, token: &NodeKind) -> bool { - match token { - NodeKind::Space { newlines } => !self.space_ends_group(*newlines), - NodeKind::LineComment => true, - NodeKind::BlockComment => true, - _ => false, - } - } - - /// Whether a space with the given number of newlines ends the current group. - fn space_ends_group(&self, n: usize) -> bool { - if n == 0 { - return false; - } - - match self.groups.last().map(|group| group.kind) { - Some(Group::Strong | Group::Emph) => n >= 2, - Some(Group::Imports) => n >= 1, - Some(Group::Expr) if n >= 1 => { - // Allow else and method call to continue on next line. - self.groups.iter().nth_back(1).map(|group| group.kind) - != Some(Group::Brace) - || !matches!( - self.tokens.clone().next(), - Some(NodeKind::Else | NodeKind::Dot) - ) - } - _ => false, - } - } - - /// Whether we are inside the given group (can be nested). - fn inside(&self, kind: Group) -> bool { - self.groups - .iter() - .rev() - .take_while(|g| !kind.is_weak() || g.kind.is_weak()) - .any(|g| g.kind == kind) - } -} - -/// Error handling. -impl Parser<'_> { - /// Eat the current token and add an error that it is unexpected. - pub fn unexpected(&mut self) { - if let Some(found) = self.peek() { - let msg = format_eco!("unexpected {}", found.name()); - let error = NodeKind::Error(ErrorPos::Full, msg); - self.perform(error, Self::eat); - } - } - - /// Add an error that the `thing` was expected at the end of the last - /// non-trivia token. - pub fn expected(&mut self, thing: &str) { - self.expected_at(self.trivia_start(), thing); - } - - /// Insert an error message that `what` was expected at the marker position. - pub fn expected_at(&mut self, marker: Marker, what: &str) { - let msg = format_eco!("expected {}", what); - let error = NodeKind::Error(ErrorPos::Full, msg); - self.children.insert(marker.0, NodeData::new(error, 0).into()); - } - - /// Eat the current token and add an error that it is not the expected - /// `thing`. - pub fn expected_found(&mut self, thing: &str) { - match self.peek() { - Some(found) => { - let msg = format_eco!("expected {}, found {}", thing, found.name()); - let error = NodeKind::Error(ErrorPos::Full, msg); - self.perform(error, Self::eat); - } - None => self.expected(thing), - } - } -} - -/// Marks a location in a parser's child list. -#[derive(Debug, Copy, Clone, Eq, PartialEq)] -pub struct Marker(usize); - -impl Marker { - /// Peek at the child directly before the marker. - pub fn before<'a>(self, p: &'a Parser) -> Option<&'a SyntaxNode> { - p.children.get(self.0.checked_sub(1)?) - } - - /// Peek at the child directly after the marker. - pub fn after<'a>(self, p: &'a Parser) -> Option<&'a SyntaxNode> { - p.children.get(self.0) - } - - /// Convert the child directly after marker. - pub fn convert(self, p: &mut Parser, kind: NodeKind) { - if let Some(child) = p.children.get_mut(self.0) { - child.convert(kind); - } - } - - /// Perform a subparse that wraps all children after the marker in a node - /// with the given kind. - pub fn perform<T, F>(self, p: &mut Parser, kind: NodeKind, f: F) -> T - where - F: FnOnce(&mut Parser) -> T, - { - let success = f(p); - self.end(p, kind); - success - } - - /// Wrap all children after the marker (excluding trailing trivia) in a node - /// with the given `kind`. - pub fn end(self, p: &mut Parser, kind: NodeKind) { - let until = p.trivia_start().0.max(self.0); - let children = p.children.drain(self.0 .. until).collect(); - p.children - .insert(self.0, InnerNode::with_children(kind, children).into()); - } - - /// Wrap all children that do not fulfill the predicate in error nodes. - pub fn filter_children<F>(self, p: &mut Parser, mut f: F) - where - F: FnMut(&SyntaxNode) -> Result<(), &'static str>, - { - for child in &mut p.children[self.0 ..] { - // Don't expose errors. - if child.kind().is_error() { - continue; - } - - // Don't expose trivia in code. - if p.tokens.mode() != TokenMode::Markup && child.kind().is_trivia() { - continue; - } - - if let Err(msg) = f(child) { - let mut msg = EcoString::from(msg); - if msg.starts_with("expected") { - msg.push_str(", found "); - msg.push_str(child.kind().name()); - } - let error = NodeKind::Error(ErrorPos::Full, msg); - let inner = mem::take(child); - *child = InnerNode::with_child(error, inner).into(); - } - } - } -} - -/// A logical group of tokens, e.g. `[...]`. -#[derive(Debug)] -struct GroupEntry { - /// The kind of group this is. This decides which token(s) will end the - /// group. For example, a [`Group::Paren`] will be ended by - /// [`Token::RightParen`]. - pub kind: Group, - /// The mode the parser was in _before_ the group started (to which we go - /// back once the group ends). - pub prev_mode: TokenMode, -} - -/// A group, confined by optional start and end delimiters. -#[derive(Debug, Copy, Clone, Eq, PartialEq)] -pub enum Group { - /// A curly-braced group: `{...}`. - Brace, - /// A bracketed group: `[...]`. - Bracket, - /// A parenthesized group: `(...)`. - Paren, - /// A group surrounded with stars: `*...*`. - Strong, - /// A group surrounded with underscore: `_..._`. - Emph, - /// A group surrounded by dollar signs: `$...$`. - Math, - /// A group ended by a semicolon or a line break: `;`, `\n`. - Expr, - /// A group for import items, ended by a semicolon, line break or `from`. - Imports, -} - -impl Group { - /// Whether the group can only force other weak groups to end. - fn is_weak(self) -> bool { - matches!(self, Group::Strong | Group::Emph) - } -} - -/// Allows parser methods to use the try operator. Never returned top-level -/// because the parser recovers from all errors. -pub type ParseResult<T = ()> = Result<T, ParseError>; - -/// The error type for parsing. -#[derive(Debug, Copy, Clone, Eq, PartialEq)] -pub struct ParseError; - -impl Display for ParseError { - fn fmt(&self, f: &mut Formatter) -> fmt::Result { - f.pad("failed to parse") - } -} - -impl std::error::Error for ParseError {} diff --git a/src/parse/resolve.rs b/src/parse/resolve.rs deleted file mode 100644 index 9fde0cf4..00000000 --- a/src/parse/resolve.rs +++ /dev/null @@ -1,238 +0,0 @@ -use unscanny::Scanner; - -use super::{is_ident, is_newline}; -use crate::syntax::RawKind; -use crate::util::EcoString; - -/// Resolve all escape sequences in a string. -pub fn resolve_string(string: &str) -> EcoString { - let mut out = EcoString::with_capacity(string.len()); - let mut s = Scanner::new(string); - - while let Some(c) = s.eat() { - if c != '\\' { - out.push(c); - continue; - } - - let start = s.locate(-1); - match s.eat() { - Some('\\') => out.push('\\'), - Some('"') => out.push('"'), - Some('n') => out.push('\n'), - Some('r') => out.push('\r'), - Some('t') => out.push('\t'), - Some('u') if s.eat_if('{') => { - // TODO: Error if closing brace is missing. - let sequence = s.eat_while(char::is_ascii_hexdigit); - let _terminated = s.eat_if('}'); - match resolve_hex(sequence) { - Some(c) => out.push(c), - None => out.push_str(s.from(start)), - } - } - - _ => out.push_str(s.from(start)), - } - } - - out -} - -/// Resolve a hexadecimal escape sequence into a character -/// (only the inner hex letters without braces or `\u`). -pub fn resolve_hex(sequence: &str) -> Option<char> { - u32::from_str_radix(sequence, 16).ok().and_then(std::char::from_u32) -} - -/// Resolve the language tag and trim the raw text. -pub fn resolve_raw(column: usize, backticks: usize, text: &str) -> RawKind { - if backticks > 1 { - let (tag, inner) = split_at_lang_tag(text); - let (text, block) = trim_and_split_raw(column, inner); - RawKind { - lang: is_ident(tag).then(|| tag.into()), - text: text.into(), - block, - } - } else { - RawKind { - lang: None, - text: split_lines(text).join("\n").into(), - block: false, - } - } -} - -/// Parse the lang tag and return it alongside the remaining inner raw text. -fn split_at_lang_tag(raw: &str) -> (&str, &str) { - let mut s = Scanner::new(raw); - ( - s.eat_until(|c: char| c == '`' || c.is_whitespace() || is_newline(c)), - s.after(), - ) -} - -/// Trim raw text and splits it into lines. -/// -/// Also returns whether at least one newline was contained in `raw`. -fn trim_and_split_raw(column: usize, mut raw: &str) -> (String, bool) { - // Trims one space at the start. - raw = raw.strip_prefix(' ').unwrap_or(raw); - - // Trim one space at the end if the last non-whitespace char is a backtick. - if raw.trim_end().ends_with('`') { - raw = raw.strip_suffix(' ').unwrap_or(raw); - } - - let mut lines = split_lines(raw); - - // Dedent based on column, but not for the first line. - for line in lines.iter_mut().skip(1) { - let offset = line - .chars() - .take(column) - .take_while(|c| c.is_whitespace()) - .map(char::len_utf8) - .sum(); - *line = &line[offset ..]; - } - - let had_newline = lines.len() > 1; - let is_whitespace = |line: &&str| line.chars().all(char::is_whitespace); - - // Trims a sequence of whitespace followed by a newline at the start. - if lines.first().map_or(false, is_whitespace) { - lines.remove(0); - } - - // Trims a newline followed by a sequence of whitespace at the end. - if lines.last().map_or(false, is_whitespace) { - lines.pop(); - } - - (lines.join("\n"), had_newline) -} - -/// Split a string into a vector of lines -/// (respecting Unicode, Unix, Mac and Windows line breaks). -fn split_lines(text: &str) -> Vec<&str> { - let mut s = Scanner::new(text); - let mut lines = Vec::new(); - let mut start = 0; - let mut end = 0; - - while let Some(c) = s.eat() { - if is_newline(c) { - if c == '\r' { - s.eat_if('\n'); - } - - lines.push(&text[start .. end]); - start = s.cursor(); - } - end = s.cursor(); - } - - lines.push(&text[start ..]); - lines -} - -#[cfg(test)] -#[rustfmt::skip] -mod tests { - use super::*; - - #[test] - fn test_resolve_strings() { - #[track_caller] - fn test(string: &str, expected: &str) { - assert_eq!(resolve_string(string), expected); - } - - test(r#"hello world"#, "hello world"); - test(r#"hello\nworld"#, "hello\nworld"); - test(r#"a\"bc"#, "a\"bc"); - test(r#"a\u{2603}bc"#, "a☃bc"); - test(r#"a\u{26c3bg"#, "a𦰻g"); - test(r#"av\u{6797"#, "av林"); - test(r#"a\\"#, "a\\"); - test(r#"a\\\nbc"#, "a\\\nbc"); - test(r#"a\t\r\nbc"#, "a\t\r\nbc"); - test(r"🌎", "🌎"); - test(r"🌎\", r"🌎\"); - test(r"\🌎", r"\🌎"); - } - - #[test] - fn test_split_at_lang_tag() { - #[track_caller] - fn test(text: &str, lang: &str, inner: &str) { - assert_eq!(split_at_lang_tag(text), (lang, inner)); - } - - test("typst it!", "typst", " it!"); - test("typst\n it!", "typst", "\n it!"); - test("typst\n it!", "typst", "\n it!"); - test("abc`", "abc", "`"); - test(" hi", "", " hi"); - test("`", "", "`"); - } - - #[test] - fn test_resolve_raw() { - #[track_caller] - fn test( - column: usize, - backticks: usize, - raw: &str, - lang: Option<&str>, - text: &str, - block: bool, - ) { - let node = resolve_raw(column, backticks, raw); - assert_eq!(node.lang.as_deref(), lang); - assert_eq!(node.text, text); - assert_eq!(node.block, block); - } - - // Just one backtick. - test(0, 1, "py", None, "py", false); - test(0, 1, "1\n2", None, "1\n2", false); - test(0, 1, "1\r\n2", None, "1\n2", false); - - // More than one backtick with lang tag. - test(0, 2, "js alert()", Some("js"), "alert()", false); - test(0, 3, "py quit(\n\n)", Some("py"), "quit(\n\n)", true); - test(0, 2, "♥", None, "", false); - - // Trimming of whitespace (tested more thoroughly in separate test). - test(0, 2, " a", None, "a", false); - test(0, 2, " a", None, " a", false); - test(0, 2, " \na", None, "a", true); - - // Dedenting - test(2, 3, " def foo():\n bar()", None, "def foo():\n bar()", true); - } - - #[test] - fn test_trim_raw() { - #[track_caller] - fn test(text: &str, expected: &str) { - assert_eq!(trim_and_split_raw(0, text).0, expected); - } - - test(" hi", "hi"); - test(" hi", " hi"); - test("\nhi", "hi"); - test(" \n hi", " hi"); - test("hi` ", "hi`"); - test("hi` ", "hi` "); - test("hi` ", "hi` "); - test("hi ", "hi "); - test("hi ", "hi "); - test("hi\n", "hi"); - test("hi \n ", "hi "); - test(" \n hi \n ", " hi "); - } -} diff --git a/src/parse/tokens.rs b/src/parse/tokens.rs deleted file mode 100644 index 73c64d1e..00000000 --- a/src/parse/tokens.rs +++ /dev/null @@ -1,1176 +0,0 @@ -use std::sync::Arc; - -use unicode_xid::UnicodeXID; -use unscanny::Scanner; - -use super::resolve::{resolve_hex, resolve_raw, resolve_string}; -use crate::geom::{AngleUnit, LengthUnit}; -use crate::syntax::{ErrorPos, NodeKind, RawKind, Unit}; -use crate::util::EcoString; - -/// An iterator over the tokens of a string of source code. -#[derive(Clone)] -pub struct Tokens<'s> { - /// The underlying scanner. - s: Scanner<'s>, - /// The mode the scanner is in. This determines what tokens it recognizes. - mode: TokenMode, - /// Whether the last token has been terminated. - terminated: bool, - /// Offsets the indentation on the first line of the source. - column_offset: usize, -} - -/// What kind of tokens to emit. -#[derive(Debug, Copy, Clone, Eq, PartialEq)] -pub enum TokenMode { - /// Text and markup. - Markup, - /// Math atoms, operators, etc. - Math, - /// Keywords, literals and operators. - Code, -} - -impl<'s> Tokens<'s> { - /// Create a new token iterator with the given mode. - #[inline] - pub fn new(text: &'s str, mode: TokenMode) -> Self { - Self::with_prefix("", text, mode) - } - - /// Create a new token iterator with the given mode and a prefix to offset - /// column calculations. - #[inline] - pub fn with_prefix(prefix: &str, text: &'s str, mode: TokenMode) -> Self { - Self { - s: Scanner::new(text), - mode, - terminated: true, - column_offset: column(prefix, prefix.len(), 0), - } - } - - /// Get the current token mode. - #[inline] - pub fn mode(&self) -> TokenMode { - self.mode - } - - /// Change the token mode. - #[inline] - pub fn set_mode(&mut self, mode: TokenMode) { - self.mode = mode; - } - - /// The index in the string at which the last token ends and next token - /// will start. - #[inline] - pub fn cursor(&self) -> usize { - self.s.cursor() - } - - /// Jump to the given index in the string. - #[inline] - pub fn jump(&mut self, index: usize) { - self.s.jump(index); - } - - /// The underlying scanner. - #[inline] - pub fn scanner(&self) -> Scanner<'s> { - self.s - } - - /// Whether the last token was terminated. - #[inline] - pub fn terminated(&self) -> bool { - self.terminated - } - - /// The column index of a given index in the source string. - #[inline] - pub fn column(&self, index: usize) -> usize { - column(self.s.string(), index, self.column_offset) - } -} - -impl<'s> Iterator for Tokens<'s> { - type Item = NodeKind; - - /// Parse the next token in the source code. - #[inline] - fn next(&mut self) -> Option<Self::Item> { - let start = self.s.cursor(); - let c = self.s.eat()?; - Some(match c { - // Trivia. - '/' if self.s.eat_if('/') => self.line_comment(), - '/' if self.s.eat_if('*') => self.block_comment(), - '*' if self.s.eat_if('/') => { - NodeKind::Error(ErrorPos::Full, "unexpected end of block comment".into()) - } - c if c.is_whitespace() => self.whitespace(c), - - // Other things. - _ => match self.mode { - TokenMode::Markup => self.markup(start, c), - TokenMode::Math => self.math(start, c), - TokenMode::Code => self.code(start, c), - }, - }) - } -} - -impl<'s> Tokens<'s> { - fn line_comment(&mut self) -> NodeKind { - self.s.eat_until(is_newline); - if self.s.peek().is_none() { - self.terminated = false; - } - NodeKind::LineComment - } - - fn block_comment(&mut self) -> NodeKind { - let mut state = '_'; - let mut depth = 1; - self.terminated = false; - - // Find the first `*/` that does not correspond to a nested `/*`. - while let Some(c) = self.s.eat() { - state = match (state, c) { - ('*', '/') => { - depth -= 1; - if depth == 0 { - self.terminated = true; - break; - } - '_' - } - ('/', '*') => { - depth += 1; - '_' - } - ('/', '/') => { - self.line_comment(); - '_' - } - _ => c, - } - } - - NodeKind::BlockComment - } - - fn whitespace(&mut self, c: char) -> NodeKind { - if c == ' ' && !self.s.at(char::is_whitespace) { - return NodeKind::Space { newlines: 0 }; - } - - self.s.uneat(); - - // Count the number of newlines. - let mut newlines = 0; - while let Some(c) = self.s.eat() { - if !c.is_whitespace() { - self.s.uneat(); - break; - } - - if is_newline(c) { - if c == '\r' { - self.s.eat_if('\n'); - } - newlines += 1; - } - } - - NodeKind::Space { newlines } - } - - #[inline] - fn markup(&mut self, start: usize, c: char) -> NodeKind { - match c { - // Blocks. - '{' => NodeKind::LeftBrace, - '}' => NodeKind::RightBrace, - '[' => NodeKind::LeftBracket, - ']' => NodeKind::RightBracket, - - // Multi-char things. - '#' => self.hash(start), - '.' if self.s.eat_if("..") => NodeKind::Shorthand('\u{2026}'), - '-' => self.hyph(), - 'h' if self.s.eat_if("ttp://") || self.s.eat_if("ttps://") => { - self.link(start) - } - '`' => self.raw(), - c if c.is_ascii_digit() => self.numbering(start), - '<' => self.label(), - '@' => self.reference(start), - - // Escape sequences. - '\\' => self.backslash(), - - // Single-char things. - '~' => NodeKind::Shorthand('\u{00A0}'), - '\'' => NodeKind::SmartQuote { double: false }, - '"' => NodeKind::SmartQuote { double: true }, - '*' if !self.in_word() => NodeKind::Star, - '_' if !self.in_word() => NodeKind::Underscore, - '$' => NodeKind::Dollar, - '=' => NodeKind::Eq, - '+' => NodeKind::Plus, - '/' => NodeKind::Slash, - ':' => NodeKind::Colon, - - // Plain text. - _ => self.text(start), - } - } - - #[inline] - fn text(&mut self, start: usize) -> NodeKind { - macro_rules! table { - ($(|$c:literal)*) => {{ - let mut t = [false; 128]; - $(t[$c as usize] = true;)* - t - }} - } - - const TABLE: [bool; 128] = table! { - | ' ' | '\t' | '\n' | '\x0b' | '\x0c' | '\r' | '\\' | '/' - | '[' | ']' | '{' | '}' | '~' | '-' | '.' | '\'' | '"' - | '*' | '_' | ':' | 'h' | '`' | '$' | '<' | '>' | '@' | '#' - }; - - loop { - self.s.eat_until(|c: char| { - TABLE.get(c as usize).copied().unwrap_or_else(|| c.is_whitespace()) - }); - - // Continue with the same text node if the thing would become text - // anyway. - let mut s = self.s; - match s.eat() { - Some('/') if !s.at(['/', '*']) => {} - Some(' ') if s.at(char::is_alphanumeric) => {} - Some('-') if !s.at(['-', '?']) => {} - Some('.') if !s.at("..") => {} - Some('h') if !s.at("ttp://") && !s.at("ttps://") => {} - Some('@' | '#') if !s.at(is_id_start) => {} - _ => break, - } - - self.s = s; - } - - NodeKind::Text(self.s.from(start).into()) - } - - fn backslash(&mut self) -> NodeKind { - match self.s.peek() { - Some('u') if self.s.eat_if("u{") => { - let sequence = self.s.eat_while(char::is_ascii_alphanumeric); - if self.s.eat_if('}') { - if let Some(c) = resolve_hex(sequence) { - NodeKind::Escape(c) - } else { - NodeKind::Error( - ErrorPos::Full, - "invalid unicode escape sequence".into(), - ) - } - } else { - self.terminated = false; - NodeKind::Error(ErrorPos::End, "expected closing brace".into()) - } - } - - // Linebreaks. - Some(c) if c.is_whitespace() => NodeKind::Linebreak, - None => NodeKind::Linebreak, - - // Escapes. - Some(c) => { - self.s.expect(c); - NodeKind::Escape(c) - } - } - } - - fn hash(&mut self, start: usize) -> NodeKind { - if self.s.at(is_id_start) { - let read = self.s.eat_while(is_id_continue); - match keyword(read) { - Some(keyword) => keyword, - None => NodeKind::Ident(read.into()), - } - } else { - self.text(start) - } - } - - fn hyph(&mut self) -> NodeKind { - if self.s.eat_if('-') { - if self.s.eat_if('-') { - NodeKind::Shorthand('\u{2014}') - } else { - NodeKind::Shorthand('\u{2013}') - } - } else if self.s.eat_if('?') { - NodeKind::Shorthand('\u{00AD}') - } else { - NodeKind::Minus - } - } - - fn link(&mut self, start: usize) -> NodeKind { - #[rustfmt::skip] - self.s.eat_while(|c: char| matches!(c, - | '0' ..= '9' - | 'a' ..= 'z' - | 'A' ..= 'Z' - | '~' | '/' | '%' | '?' | '#' | '&' | '+' | '=' - | '\'' | '.' | ',' | ';' - )); - if self.s.scout(-1) == Some('.') { - self.s.uneat(); - } - NodeKind::Link(self.s.from(start).into()) - } - - fn raw(&mut self) -> NodeKind { - let column = self.column(self.s.cursor() - 1); - - let mut backticks = 1; - while self.s.eat_if('`') { - backticks += 1; - } - - // Special case for empty inline block. - if backticks == 2 { - return NodeKind::Raw(Arc::new(RawKind { - text: EcoString::new(), - lang: None, - block: false, - })); - } - - let start = self.s.cursor(); - let mut found = 0; - while found < backticks { - match self.s.eat() { - Some('`') => found += 1, - Some(_) => found = 0, - None => break, - } - } - - if found == backticks { - let end = self.s.cursor() - found as usize; - NodeKind::Raw(Arc::new(resolve_raw( - column, - backticks, - self.s.get(start .. end), - ))) - } else { - self.terminated = false; - let remaining = backticks - found; - let noun = if remaining == 1 { "backtick" } else { "backticks" }; - NodeKind::Error( - ErrorPos::End, - if found == 0 { - format_eco!("expected {} {}", remaining, noun) - } else { - format_eco!("expected {} more {}", remaining, noun) - }, - ) - } - } - - fn numbering(&mut self, start: usize) -> NodeKind { - self.s.eat_while(char::is_ascii_digit); - let read = self.s.from(start); - if self.s.eat_if('.') { - if let Ok(number) = read.parse() { - return NodeKind::EnumNumbering(number); - } - } - - self.text(start) - } - - fn label(&mut self) -> NodeKind { - let label = self.s.eat_while(is_id_continue); - if self.s.eat_if('>') { - if !label.is_empty() { - NodeKind::Label(label.into()) - } else { - NodeKind::Error(ErrorPos::Full, "label cannot be empty".into()) - } - } else { - self.terminated = false; - NodeKind::Error(ErrorPos::End, "expected closing angle bracket".into()) - } - } - - fn reference(&mut self, start: usize) -> NodeKind { - let label = self.s.eat_while(is_id_continue); - if !label.is_empty() { - NodeKind::Ref(label.into()) - } else { - self.text(start) - } - } - - fn math(&mut self, start: usize, c: char) -> NodeKind { - match c { - // Escape sequences. - '\\' => self.backslash(), - - // Single-char things. - '_' => NodeKind::Underscore, - '^' => NodeKind::Hat, - '/' => NodeKind::Slash, - '&' => NodeKind::Amp, - '$' => NodeKind::Dollar, - - // Brackets. - '{' => NodeKind::LeftBrace, - '}' => NodeKind::RightBrace, - '[' => NodeKind::LeftBracket, - ']' => NodeKind::RightBracket, - '(' => NodeKind::LeftParen, - ')' => NodeKind::RightParen, - - // Identifiers. - c if is_math_id_start(c) && self.s.at(is_math_id_continue) => { - self.s.eat_while(is_math_id_continue); - NodeKind::Ident(self.s.from(start).into()) - } - - // Numbers. - c if c.is_numeric() => { - self.s.eat_while(char::is_numeric); - NodeKind::Atom(self.s.from(start).into()) - } - - // Other math atoms. - c => NodeKind::Atom(c.into()), - } - } - - fn code(&mut self, start: usize, c: char) -> NodeKind { - match c { - // Blocks. - '{' => NodeKind::LeftBrace, - '}' => NodeKind::RightBrace, - '[' => NodeKind::LeftBracket, - ']' => NodeKind::RightBracket, - - // Parentheses. - '(' => NodeKind::LeftParen, - ')' => NodeKind::RightParen, - - // Two-char operators. - '=' if self.s.eat_if('=') => NodeKind::EqEq, - '!' if self.s.eat_if('=') => NodeKind::ExclEq, - '<' if self.s.eat_if('=') => NodeKind::LtEq, - '>' if self.s.eat_if('=') => NodeKind::GtEq, - '+' if self.s.eat_if('=') => NodeKind::PlusEq, - '-' if self.s.eat_if('=') => NodeKind::HyphEq, - '*' if self.s.eat_if('=') => NodeKind::StarEq, - '/' if self.s.eat_if('=') => NodeKind::SlashEq, - '.' if self.s.eat_if('.') => NodeKind::Dots, - '=' if self.s.eat_if('>') => NodeKind::Arrow, - - // Single-char operators. - ',' => NodeKind::Comma, - ';' => NodeKind::Semicolon, - ':' => NodeKind::Colon, - '+' => NodeKind::Plus, - '-' => NodeKind::Minus, - '*' => NodeKind::Star, - '/' => NodeKind::Slash, - '=' => NodeKind::Eq, - '<' => NodeKind::Lt, - '>' => NodeKind::Gt, - '.' if !self.s.at(char::is_ascii_digit) => NodeKind::Dot, - - // Identifiers. - c if is_id_start(c) => self.ident(start), - - // Numbers. - c if c.is_ascii_digit() || (c == '.' && self.s.at(char::is_ascii_digit)) => { - self.number(start, c) - } - - // Strings. - '"' => self.string(), - - // Invalid token. - _ => NodeKind::Error(ErrorPos::Full, "not valid here".into()), - } - } - - fn ident(&mut self, start: usize) -> NodeKind { - self.s.eat_while(is_id_continue); - match self.s.from(start) { - "none" => NodeKind::None, - "auto" => NodeKind::Auto, - "true" => NodeKind::Bool(true), - "false" => NodeKind::Bool(false), - id => keyword(id).unwrap_or_else(|| NodeKind::Ident(id.into())), - } - } - - fn number(&mut self, start: usize, c: char) -> NodeKind { - // Read the first part (integer or fractional depending on `first`). - self.s.eat_while(char::is_ascii_digit); - - // Read the fractional part if not already done. - // Make sure not to confuse a range for the decimal separator. - if c != '.' && !self.s.at("..") && self.s.eat_if('.') { - self.s.eat_while(char::is_ascii_digit); - } - - // Read the exponent. - if !self.s.at("em") && self.s.eat_if(['e', 'E']) { - self.s.eat_if(['+', '-']); - self.s.eat_while(char::is_ascii_digit); - } - - // Read the suffix. - let suffix_start = self.s.cursor(); - if !self.s.eat_if('%') { - self.s.eat_while(char::is_ascii_alphanumeric); - } - - let number = self.s.get(start .. suffix_start); - let suffix = self.s.from(suffix_start); - - // Find out whether it is a simple number. - if suffix.is_empty() { - if let Ok(i) = number.parse::<i64>() { - return NodeKind::Int(i); - } - } - - let v = match number.parse::<f64>() { - Ok(v) => v, - Err(_) => return NodeKind::Error(ErrorPos::Full, "invalid number".into()), - }; - - match suffix { - "" => NodeKind::Float(v), - "pt" => NodeKind::Numeric(v, Unit::Length(LengthUnit::Pt)), - "mm" => NodeKind::Numeric(v, Unit::Length(LengthUnit::Mm)), - "cm" => NodeKind::Numeric(v, Unit::Length(LengthUnit::Cm)), - "in" => NodeKind::Numeric(v, Unit::Length(LengthUnit::In)), - "deg" => NodeKind::Numeric(v, Unit::Angle(AngleUnit::Deg)), - "rad" => NodeKind::Numeric(v, Unit::Angle(AngleUnit::Rad)), - "em" => NodeKind::Numeric(v, Unit::Em), - "fr" => NodeKind::Numeric(v, Unit::Fr), - "%" => NodeKind::Numeric(v, Unit::Percent), - _ => NodeKind::Error(ErrorPos::Full, "invalid number suffix".into()), - } - } - - fn string(&mut self) -> NodeKind { - let mut escaped = false; - let verbatim = self.s.eat_until(|c| { - if c == '"' && !escaped { - true - } else { - escaped = c == '\\' && !escaped; - false - } - }); - - let string = resolve_string(verbatim); - if self.s.eat_if('"') { - NodeKind::Str(string) - } else { - self.terminated = false; - NodeKind::Error(ErrorPos::End, "expected quote".into()) - } - } - - fn in_word(&self) -> bool { - let alphanumeric = |c: Option<char>| c.map_or(false, |c| c.is_alphanumeric()); - let prev = self.s.scout(-2); - let next = self.s.peek(); - alphanumeric(prev) && alphanumeric(next) - } -} - -fn keyword(ident: &str) -> Option<NodeKind> { - Some(match ident { - "not" => NodeKind::Not, - "and" => NodeKind::And, - "or" => NodeKind::Or, - "let" => NodeKind::Let, - "set" => NodeKind::Set, - "show" => NodeKind::Show, - "wrap" => NodeKind::Wrap, - "if" => NodeKind::If, - "else" => NodeKind::Else, - "for" => NodeKind::For, - "in" => NodeKind::In, - "as" => NodeKind::As, - "while" => NodeKind::While, - "break" => NodeKind::Break, - "continue" => NodeKind::Continue, - "return" => NodeKind::Return, - "import" => NodeKind::Import, - "include" => NodeKind::Include, - "from" => NodeKind::From, - _ => return None, - }) -} - -/// The column index of a given index in the source string, given a column -/// offset for the first line. -#[inline] -fn column(string: &str, index: usize, offset: usize) -> usize { - let mut apply_offset = false; - let res = string[.. index] - .char_indices() - .rev() - .take_while(|&(_, c)| !is_newline(c)) - .inspect(|&(i, _)| { - if i == 0 { - apply_offset = true - } - }) - .count(); - - // The loop is never executed if the slice is empty, but we are of - // course still at the start of the first line. - if index == 0 { - apply_offset = true; - } - - if apply_offset { res + offset } else { res } -} - -/// Whether this character denotes a newline. -#[inline] -pub fn is_newline(character: char) -> bool { - matches!( - character, - // Line Feed, Vertical Tab, Form Feed, Carriage Return. - '\n' | '\x0B' | '\x0C' | '\r' | - // Next Line, Line Separator, Paragraph Separator. - '\u{0085}' | '\u{2028}' | '\u{2029}' - ) -} - -/// Whether a string is a valid unicode identifier. -/// -/// In addition to what is specified in the [Unicode Standard][uax31], we allow: -/// - `_` as a starting character, -/// - `_` and `-` as continuing characters. -/// -/// [uax31]: http://www.unicode.org/reports/tr31/ -#[inline] -pub fn is_ident(string: &str) -> bool { - let mut chars = string.chars(); - chars - .next() - .map_or(false, |c| is_id_start(c) && chars.all(is_id_continue)) -} - -/// Whether a character can start an identifier. -#[inline] -fn is_id_start(c: char) -> bool { - c.is_xid_start() || c == '_' -} - -/// Whether a character can continue an identifier. -#[inline] -fn is_id_continue(c: char) -> bool { - c.is_xid_continue() || c == '_' || c == '-' -} - -/// Whether a character can start an identifier in math. -#[inline] -fn is_math_id_start(c: char) -> bool { - c.is_xid_start() -} - -/// Whether a character can continue an identifier in math. -#[inline] -fn is_math_id_continue(c: char) -> bool { - c.is_xid_continue() && c != '_' -} - -#[cfg(test)] -#[allow(non_snake_case)] -mod tests { - use super::*; - use crate::parse::tests::check; - - use ErrorPos::*; - use NodeKind::*; - use Option::None; - use TokenMode::{Code, Markup}; - - fn Space(newlines: usize) -> NodeKind { - NodeKind::Space { newlines } - } - - fn Raw(text: &str, lang: Option<&str>, block: bool) -> NodeKind { - NodeKind::Raw(Arc::new(RawKind { - text: text.into(), - lang: lang.map(Into::into), - block, - })) - } - - fn Str(string: &str) -> NodeKind { - NodeKind::Str(string.into()) - } - - fn Text(string: &str) -> NodeKind { - NodeKind::Text(string.into()) - } - - fn Ident(ident: &str) -> NodeKind { - NodeKind::Ident(ident.into()) - } - - fn Error(pos: ErrorPos, message: &str) -> NodeKind { - NodeKind::Error(pos, message.into()) - } - - /// Building blocks for suffix testing. - /// - /// We extend each test case with a collection of different suffixes to make - /// sure tokens end at the correct position. These suffixes are split into - /// blocks, which can be disabled/enabled per test case. For example, when - /// testing identifiers we disable letter suffixes because these would - /// mingle with the identifiers. - /// - /// Suffix blocks: - /// - ' ': spacing - /// - 'a': letters - /// - '1': numbers - /// - '/': symbols - const BLOCKS: &str = " a1/"; - - // Suffixes described by four-tuples of: - // - // - block the suffix is part of - // - mode in which the suffix is applicable - // - the suffix string - // - the resulting suffix NodeKind - fn suffixes() - -> impl Iterator<Item = (char, Option<TokenMode>, &'static str, NodeKind)> { - [ - // Whitespace suffixes. - (' ', None, " ", Space(0)), - (' ', None, "\n", Space(1)), - (' ', None, "\r", Space(1)), - (' ', None, "\r\n", Space(1)), - // Letter suffixes. - ('a', Some(Markup), "hello", Text("hello")), - ('a', Some(Markup), "💚", Text("💚")), - ('a', Some(Code), "val", Ident("val")), - ('a', Some(Code), "α", Ident("α")), - ('a', Some(Code), "_", Ident("_")), - // Number suffixes. - ('1', Some(Code), "2", Int(2)), - ('1', Some(Code), ".2", Float(0.2)), - // Symbol suffixes. - ('/', None, "[", LeftBracket), - ('/', None, "//", LineComment), - ('/', None, "/**/", BlockComment), - ('/', Some(Markup), "*", Star), - ('/', Some(Markup), r"\\", Escape('\\')), - ('/', Some(Markup), "#let", Let), - ('/', Some(Code), "(", LeftParen), - ('/', Some(Code), ":", Colon), - ('/', Some(Code), "+=", PlusEq), - ] - .into_iter() - } - - macro_rules! t { - (Both $($tts:tt)*) => { - t!(Markup $($tts)*); - t!(Code $($tts)*); - }; - ($mode:ident $([$blocks:literal])?: $text:expr => $($token:expr),*) => {{ - // Test without suffix. - t!(@$mode: $text => $($token),*); - - // Test with each applicable suffix. - for (block, mode, suffix, ref token) in suffixes() { - let text = $text; - #[allow(unused_variables)] - let blocks = BLOCKS; - $(let blocks = $blocks;)? - assert!(!blocks.contains(|c| !BLOCKS.contains(c))); - if (mode.is_none() || mode == Some($mode)) && blocks.contains(block) { - t!(@$mode: format!("{}{}", text, suffix) => $($token,)* token); - } - } - }}; - (@$mode:ident: $text:expr => $($token:expr),*) => {{ - let text = $text; - let found = Tokens::new(&text, $mode).collect::<Vec<_>>(); - let expected = vec![$($token.clone()),*]; - check(&text, found, expected); - }}; - } - - #[test] - fn test_tokenize_brackets() { - // Test in markup. - t!(Markup: "{" => LeftBrace); - t!(Markup: "}" => RightBrace); - t!(Markup: "[" => LeftBracket); - t!(Markup: "]" => RightBracket); - t!(Markup[" /"]: "(" => Text("(")); - t!(Markup[" /"]: ")" => Text(")")); - - // Test in code. - t!(Code: "{" => LeftBrace); - t!(Code: "}" => RightBrace); - t!(Code: "[" => LeftBracket); - t!(Code: "]" => RightBracket); - t!(Code: "(" => LeftParen); - t!(Code: ")" => RightParen); - } - - #[test] - fn test_tokenize_whitespace() { - // Test basic whitespace. - t!(Both["a1/"]: "" => ); - t!(Both["a1/"]: " " => Space(0)); - t!(Both["a1/"]: " " => Space(0)); - t!(Both["a1/"]: "\t" => Space(0)); - t!(Both["a1/"]: " \t" => Space(0)); - t!(Both["a1/"]: "\u{202F}" => Space(0)); - - // Test newline counting. - t!(Both["a1/"]: "\n" => Space(1)); - t!(Both["a1/"]: "\n " => Space(1)); - t!(Both["a1/"]: " \n" => Space(1)); - t!(Both["a1/"]: " \n " => Space(1)); - t!(Both["a1/"]: "\r\n" => Space(1)); - t!(Both["a1/"]: "\r\n\r" => Space(2)); - t!(Both["a1/"]: " \n\t \n " => Space(2)); - t!(Both["a1/"]: "\n\r" => Space(2)); - t!(Both["a1/"]: " \r\r\n \x0D" => Space(3)); - } - - #[test] - fn test_tokenize_text() { - // Test basic text. - t!(Markup[" /"]: "hello" => Text("hello")); - t!(Markup[" /"]: "reha-world" => Text("reha-world")); - - // Test code symbols in text. - t!(Markup[" /"]: "a():\"b" => Text("a()"), Colon, SmartQuote { double: true }, Text("b")); - t!(Markup[" /"]: ";,|/+" => Text(";,|/+")); - t!(Markup[" /"]: "=-a" => Eq, Minus, Text("a")); - t!(Markup[" "]: "#123" => Text("#123")); - - // Test text ends. - t!(Markup[""]: "hello " => Text("hello"), Space(0)); - t!(Markup[""]: "hello~" => Text("hello"), Shorthand('\u{00A0}')); - } - - #[test] - fn test_tokenize_escape_sequences() { - // Test escapable symbols. - t!(Markup: r"\\" => Escape('\\')); - t!(Markup: r"\/" => Escape('/')); - t!(Markup: r"\[" => Escape('[')); - t!(Markup: r"\]" => Escape(']')); - t!(Markup: r"\{" => Escape('{')); - t!(Markup: r"\}" => Escape('}')); - t!(Markup: r"\*" => Escape('*')); - t!(Markup: r"\_" => Escape('_')); - t!(Markup: r"\=" => Escape('=')); - t!(Markup: r"\~" => Escape('~')); - t!(Markup: r"\'" => Escape('\'')); - t!(Markup: r#"\""# => Escape('"')); - t!(Markup: r"\`" => Escape('`')); - t!(Markup: r"\$" => Escape('$')); - t!(Markup: r"\#" => Escape('#')); - t!(Markup: r"\a" => Escape('a')); - t!(Markup: r"\u" => Escape('u')); - t!(Markup: r"\1" => Escape('1')); - - // Test basic unicode escapes. - t!(Markup: r"\u{}" => Error(Full, "invalid unicode escape sequence")); - t!(Markup: r"\u{2603}" => Escape('☃')); - t!(Markup: r"\u{P}" => Error(Full, "invalid unicode escape sequence")); - - // Test unclosed unicode escapes. - t!(Markup[" /"]: r"\u{" => Error(End, "expected closing brace")); - t!(Markup[" /"]: r"\u{1" => Error(End, "expected closing brace")); - t!(Markup[" /"]: r"\u{26A4" => Error(End, "expected closing brace")); - t!(Markup[" /"]: r"\u{1Q3P" => Error(End, "expected closing brace")); - t!(Markup: r"\u{1🏕}" => Error(End, "expected closing brace"), Text("🏕"), RightBrace); - } - - #[test] - fn test_tokenize_markup_symbols() { - // Test markup tokens. - t!(Markup[" a1"]: "*" => Star); - t!(Markup: "_" => Underscore); - t!(Markup[""]: "===" => Eq, Eq, Eq); - t!(Markup["a1/"]: "= " => Eq, Space(0)); - t!(Markup[" "]: r"\" => Linebreak); - t!(Markup: "~" => Shorthand('\u{00A0}')); - t!(Markup["a1/"]: "-?" => Shorthand('\u{00AD}')); - t!(Markup["a "]: r"a--" => Text("a"), Shorthand('\u{2013}')); - t!(Markup["a1/"]: "- " => Minus, Space(0)); - t!(Markup[" "]: "+" => Plus); - t!(Markup[" "]: "1." => EnumNumbering(1)); - t!(Markup[" "]: "1.a" => EnumNumbering(1), Text("a")); - t!(Markup[" /"]: "a1." => Text("a1.")); - } - - #[test] - fn test_tokenize_code_symbols() { - // Test all symbols. - t!(Code: "," => Comma); - t!(Code: ";" => Semicolon); - t!(Code: ":" => Colon); - t!(Code: "+" => Plus); - t!(Code: "-" => Minus); - t!(Code[" a1"]: "*" => Star); - t!(Code[" a1"]: "/" => Slash); - t!(Code[" a/"]: "." => Dot); - t!(Code: "=" => Eq); - t!(Code: "==" => EqEq); - t!(Code: "!=" => ExclEq); - t!(Code: "<" => Lt); - t!(Code: "<=" => LtEq); - t!(Code: ">" => Gt); - t!(Code: ">=" => GtEq); - t!(Code: "+=" => PlusEq); - t!(Code: "-=" => HyphEq); - t!(Code: "*=" => StarEq); - t!(Code: "/=" => SlashEq); - t!(Code: ".." => Dots); - t!(Code: "=>" => Arrow); - - // Test combinations. - t!(Code: "<=>" => LtEq, Gt); - t!(Code[" a/"]: "..." => Dots, Dot); - - // Test hyphen as symbol vs part of identifier. - t!(Code[" /"]: "-1" => Minus, Int(1)); - t!(Code[" /"]: "-a" => Minus, Ident("a")); - t!(Code[" /"]: "--1" => Minus, Minus, Int(1)); - t!(Code[" /"]: "--_a" => Minus, Minus, Ident("_a")); - t!(Code[" /"]: "a-b" => Ident("a-b")); - - // Test invalid. - t!(Code: r"\" => Error(Full, "not valid here")); - } - - #[test] - fn test_tokenize_keywords() { - // A list of a few (not all) keywords. - let list = [ - ("not", Not), - ("let", Let), - ("if", If), - ("else", Else), - ("for", For), - ("in", In), - ("import", Import), - ]; - - for (s, t) in list.clone() { - t!(Markup[" "]: format!("#{}", s) => t); - t!(Markup[" "]: format!("#{0}#{0}", s) => t, t); - t!(Markup[" /"]: format!("# {}", s) => Text(&format!("# {s}"))); - } - - for (s, t) in list { - t!(Code[" "]: s => t); - t!(Markup[" /"]: s => Text(s)); - } - - // Test simple identifier. - t!(Markup[" "]: "#letter" => Ident("letter")); - t!(Code[" /"]: "falser" => Ident("falser")); - t!(Code[" /"]: "None" => Ident("None")); - t!(Code[" /"]: "True" => Ident("True")); - } - - #[test] - fn test_tokenize_raw_blocks() { - // Test basic raw block. - t!(Markup: "``" => Raw("", None, false)); - t!(Markup: "`raw`" => Raw("raw", None, false)); - t!(Markup[""]: "`]" => Error(End, "expected 1 backtick")); - - // Test special symbols in raw block. - t!(Markup: "`[brackets]`" => Raw("[brackets]", None, false)); - t!(Markup[""]: r"`\`` " => Raw(r"\", None, false), Error(End, "expected 1 backtick")); - - // Test separated closing backticks. - t!(Markup: "```not `y`e`t```" => Raw("`y`e`t", Some("not"), false)); - - // Test more backticks. - t!(Markup: "``nope``" => Raw("", None, false), Text("nope"), Raw("", None, false)); - t!(Markup: "````🚀````" => Raw("", None, false)); - t!(Markup[""]: "`````👩🚀````noend" => Error(End, "expected 5 backticks")); - t!(Markup[""]: "````raw``````" => Raw("", Some("raw"), false), Raw("", None, false)); - } - - #[test] - fn test_tokenize_idents() { - // Test valid identifiers. - t!(Code[" /"]: "x" => Ident("x")); - t!(Code[" /"]: "value" => Ident("value")); - t!(Code[" /"]: "__main__" => Ident("__main__")); - t!(Code[" /"]: "_snake_case" => Ident("_snake_case")); - - // Test non-ascii. - t!(Code[" /"]: "α" => Ident("α")); - t!(Code[" /"]: "ម្តាយ" => Ident("ម្តាយ")); - - // Test hyphen parsed as identifier. - t!(Code[" /"]: "kebab-case" => Ident("kebab-case")); - t!(Code[" /"]: "one-10" => Ident("one-10")); - } - - #[test] - fn test_tokenize_numeric() { - let ints = [("7", 7), ("012", 12)]; - let floats = [ - (".3", 0.3), - ("0.3", 0.3), - ("3.", 3.0), - ("3.0", 3.0), - ("14.3", 14.3), - ("10e2", 1000.0), - ("10e+0", 10.0), - ("10e+1", 100.0), - ("10e-2", 0.1), - ("10.e1", 100.0), - ("10.e-1", 1.0), - (".1e1", 1.0), - ("10E2", 1000.0), - ]; - - // Test integers. - for &(s, v) in &ints { - t!(Code[" /"]: s => Int(v)); - } - - // Test floats. - for &(s, v) in &floats { - t!(Code[" /"]: s => Float(v)); - } - - // Test attached numbers. - t!(Code[" /"]: ".2.3" => Float(0.2), Float(0.3)); - t!(Code[" /"]: "1.2.3" => Float(1.2), Float(0.3)); - t!(Code[" /"]: "1e-2+3" => Float(0.01), Plus, Int(3)); - - // Test float from too large integer. - let large = i64::MAX as f64 + 1.0; - t!(Code[" /"]: large.to_string() => Float(large)); - - // Combined integers and floats. - let nums = ints.iter().map(|&(k, v)| (k, v as f64)).chain(floats); - - let suffixes: &[(&str, fn(f64) -> NodeKind)] = &[ - ("mm", |x| Numeric(x, Unit::Length(LengthUnit::Mm))), - ("pt", |x| Numeric(x, Unit::Length(LengthUnit::Pt))), - ("cm", |x| Numeric(x, Unit::Length(LengthUnit::Cm))), - ("in", |x| Numeric(x, Unit::Length(LengthUnit::In))), - ("rad", |x| Numeric(x, Unit::Angle(AngleUnit::Rad))), - ("deg", |x| Numeric(x, Unit::Angle(AngleUnit::Deg))), - ("em", |x| Numeric(x, Unit::Em)), - ("fr", |x| Numeric(x, Unit::Fr)), - ("%", |x| Numeric(x, Unit::Percent)), - ]; - - // Numeric types. - for &(suffix, build) in suffixes { - for (s, v) in nums.clone() { - t!(Code[" /"]: format!("{}{}", s, suffix) => build(v)); - } - } - - // Multiple dots close the number. - t!(Code[" /"]: "1..2" => Int(1), Dots, Int(2)); - t!(Code[" /"]: "1..2.3" => Int(1), Dots, Float(2.3)); - t!(Code[" /"]: "1.2..3" => Float(1.2), Dots, Int(3)); - - // Test invalid. - t!(Code[" /"]: "1foo" => Error(Full, "invalid number suffix")); - } - - #[test] - fn test_tokenize_strings() { - // Test basic strings. - t!(Code: "\"hi\"" => Str("hi")); - t!(Code: "\"hi\nthere\"" => Str("hi\nthere")); - t!(Code: "\"🌎\"" => Str("🌎")); - - // Test unterminated. - t!(Code[""]: "\"hi" => Error(End, "expected quote")); - - // Test escaped quote. - t!(Code: r#""a\"bc""# => Str("a\"bc")); - t!(Code[""]: r#""\""# => Error(End, "expected quote")); - } - - #[test] - fn test_tokenize_line_comments() { - // Test line comment with no trailing newline. - t!(Both[""]: "//" => LineComment); - - // Test line comment ends at newline. - t!(Both["a1/"]: "//bc\n" => LineComment, Space(1)); - t!(Both["a1/"]: "// bc \n" => LineComment, Space(1)); - t!(Both["a1/"]: "//bc\r\n" => LineComment, Space(1)); - - // Test nested line comments. - t!(Both["a1/"]: "//a//b\n" => LineComment, Space(1)); - } - - #[test] - fn test_tokenize_block_comments() { - // Test basic block comments. - t!(Both[""]: "/*" => BlockComment); - t!(Both: "/**/" => BlockComment); - t!(Both: "/*🏞*/" => BlockComment); - t!(Both: "/*\n*/" => BlockComment); - - // Test depth 1 and 2 nested block comments. - t!(Both: "/* /* */ */" => BlockComment); - t!(Both: "/*/*/**/*/*/" => BlockComment); - - // Test two nested, one unclosed block comments. - t!(Both[""]: "/*/*/**/*/" => BlockComment); - - // Test all combinations of up to two following slashes and stars. - t!(Both[""]: "/*" => BlockComment); - t!(Both[""]: "/*/" => BlockComment); - t!(Both[""]: "/**" => BlockComment); - t!(Both[""]: "/*//" => BlockComment); - t!(Both[""]: "/*/*" => BlockComment); - t!(Both[""]: "/**/" => BlockComment); - t!(Both[""]: "/***" => BlockComment); - - // Test unexpected terminator. - t!(Both: "/*Hi*/*/" => BlockComment, - Error(Full, "unexpected end of block comment")); - } -} |
