summaryrefslogtreecommitdiff
path: root/src/parse
diff options
context:
space:
mode:
Diffstat (limited to 'src/parse')
-rw-r--r--src/parse/incremental.rs520
-rw-r--r--src/parse/mod.rs1166
-rw-r--r--src/parse/parser.rs559
-rw-r--r--src/parse/resolve.rs238
-rw-r--r--src/parse/tokens.rs1176
5 files changed, 0 insertions, 3659 deletions
diff --git a/src/parse/incremental.rs b/src/parse/incremental.rs
deleted file mode 100644
index 4651a784..00000000
--- a/src/parse/incremental.rs
+++ /dev/null
@@ -1,520 +0,0 @@
-use std::ops::Range;
-use std::sync::Arc;
-
-use crate::syntax::{InnerNode, NodeKind, Span, SyntaxNode};
-
-use super::{
- is_newline, parse, reparse_code_block, reparse_content_block, reparse_markup_elements,
-};
-
-/// Refresh the given syntax node with as little parsing as possible.
-///
-/// Takes the new source, the range in the old source that was replaced and the
-/// length of the replacement.
-///
-/// Returns the range in the new source that was ultimately reparsed.
-pub fn reparse(
- root: &mut SyntaxNode,
- text: &str,
- replaced: Range<usize>,
- replacement_len: usize,
-) -> Range<usize> {
- if let SyntaxNode::Inner(inner) = root {
- let change = Change { text, replaced, replacement_len };
- if let Some(range) = try_reparse(&change, Arc::make_mut(inner), 0, true, true) {
- return range;
- }
- }
-
- let id = root.span().source();
- *root = parse(text);
- root.numberize(id, Span::FULL).unwrap();
- 0 .. text.len()
-}
-
-/// Try to reparse inside the given node.
-fn try_reparse(
- change: &Change,
- node: &mut InnerNode,
- mut offset: usize,
- outermost: bool,
- safe_to_replace: bool,
-) -> Option<Range<usize>> {
- let is_markup = matches!(node.kind(), NodeKind::Markup { .. });
- let original_count = node.children().len();
- let original_offset = offset;
-
- let mut search = SearchState::default();
- let mut ahead: Option<Ahead> = None;
-
- // Whether the first node that should be replaced is at start.
- let mut at_start = true;
-
- // Whether the last searched child is the outermost child.
- let mut child_outermost = false;
-
- // Find the the first child in the range of children to reparse.
- for (i, child) in node.children().enumerate() {
- let pos = NodePos { idx: i, offset };
- let child_span = offset .. offset + child.len();
- child_outermost = outermost && i + 1 == original_count;
-
- match search {
- SearchState::NoneFound => {
- // The edit is contained within the span of the current element.
- if child_span.contains(&change.replaced.start)
- && child_span.end >= change.replaced.end
- {
- // In Markup mode, we want to consider a non-whitespace
- // neighbor if the edit is on the node boundary.
- search = if is_markup && child_span.end == change.replaced.end {
- SearchState::RequireNonTrivia(pos)
- } else {
- SearchState::Contained(pos)
- };
- } else if child_span.contains(&change.replaced.start) {
- search = SearchState::Inside(pos);
- } else if child_span.end == change.replaced.start
- && change.replaced.start == change.replaced.end
- && child_outermost
- {
- search = SearchState::SpanFound(pos, pos);
- } else {
- // Update compulsary state of `ahead_nontrivia`.
- if let Some(ahead_nontrivia) = ahead.as_mut() {
- if let NodeKind::Space { newlines: (1 ..) } = child.kind() {
- ahead_nontrivia.newline();
- }
- }
-
- // We look only for non spaces, non-semicolon and also
- // reject text that points to the special case for URL
- // evasion and line comments.
- if !child.kind().is_space()
- && child.kind() != &NodeKind::Semicolon
- && child.kind() != &NodeKind::Text('/'.into())
- && (ahead.is_none() || change.replaced.start > child_span.end)
- && !ahead.map_or(false, Ahead::is_compulsory)
- {
- ahead = Some(Ahead::new(pos, at_start, is_bounded(child.kind())));
- }
-
- at_start = next_at_start(child.kind(), at_start);
- }
- }
- SearchState::Inside(start) => {
- if child_span.end == change.replaced.end {
- search = SearchState::RequireNonTrivia(start);
- } else if child_span.end > change.replaced.end {
- search = SearchState::SpanFound(start, pos);
- }
- }
- SearchState::RequireNonTrivia(start) => {
- if !child.kind().is_trivia() {
- search = SearchState::SpanFound(start, pos);
- }
- }
- _ => unreachable!(),
- }
-
- offset += child.len();
-
- if search.done().is_some() {
- break;
- }
- }
-
- // If we were looking for a non-whitespace element and hit the end of
- // the file here, we instead use EOF as the end of the span.
- if let SearchState::RequireNonTrivia(start) = search {
- search = SearchState::SpanFound(start, NodePos {
- idx: node.children().len() - 1,
- offset: offset - node.children().last().unwrap().len(),
- })
- }
-
- if let SearchState::Contained(pos) = search {
- // Do not allow replacement of elements inside of constructs whose
- // opening and closing brackets look the same.
- let safe_inside = is_bounded(node.kind());
- let child = &mut node.children_mut()[pos.idx];
- let prev_len = child.len();
- let prev_descendants = child.descendants();
-
- if let Some(range) = match child {
- SyntaxNode::Inner(node) => try_reparse(
- change,
- Arc::make_mut(node),
- pos.offset,
- child_outermost,
- safe_inside,
- ),
- SyntaxNode::Leaf(_) => None,
- } {
- let new_len = child.len();
- let new_descendants = child.descendants();
- node.update_parent(prev_len, new_len, prev_descendants, new_descendants);
- return Some(range);
- }
-
- let superseded_span = pos.offset .. pos.offset + prev_len;
- let func: Option<ReparseMode> = match child.kind() {
- NodeKind::CodeBlock => Some(ReparseMode::Code),
- NodeKind::ContentBlock => Some(ReparseMode::Content),
- _ => None,
- };
-
- // Return if the element was reparsable on its own, otherwise try to
- // treat it as a markup element.
- if let Some(func) = func {
- if let Some(result) = replace(
- change,
- node,
- func,
- pos.idx .. pos.idx + 1,
- superseded_span,
- outermost,
- ) {
- return Some(result);
- }
- }
- }
-
- // Make sure this is a markup node and that we may replace. If so, save
- // the current indent.
- let min_indent = match node.kind() {
- NodeKind::Markup { min_indent } if safe_to_replace => *min_indent,
- _ => return None,
- };
-
- let (mut start, end) = search.done()?;
- if let Some(ahead) = ahead {
- if start.offset == change.replaced.start || ahead.is_compulsory() {
- start = ahead.pos;
- at_start = ahead.at_start;
- }
- } else {
- start = NodePos { idx: 0, offset: original_offset };
- }
-
- let superseded_span =
- start.offset .. end.offset + node.children().as_slice()[end.idx].len();
-
- replace(
- change,
- node,
- ReparseMode::MarkupElements { at_start, min_indent },
- start.idx .. end.idx + 1,
- superseded_span,
- outermost,
- )
-}
-
-/// Reparse the superseded nodes and replace them.
-fn replace(
- change: &Change,
- node: &mut InnerNode,
- mode: ReparseMode,
- superseded_idx: Range<usize>,
- superseded_span: Range<usize>,
- outermost: bool,
-) -> Option<Range<usize>> {
- let superseded_start = superseded_idx.start;
-
- let differential: isize =
- change.replacement_len as isize - change.replaced.len() as isize;
- let newborn_end = (superseded_span.end as isize + differential) as usize;
- let newborn_span = superseded_span.start .. newborn_end;
-
- let mut prefix = "";
- for (i, c) in change.text[.. newborn_span.start].char_indices().rev() {
- if is_newline(c) {
- break;
- }
- prefix = &change.text[i .. newborn_span.start];
- }
-
- let (newborns, terminated, amount) = match mode {
- ReparseMode::Code => reparse_code_block(
- &prefix,
- &change.text[newborn_span.start ..],
- newborn_span.len(),
- ),
- ReparseMode::Content => reparse_content_block(
- &prefix,
- &change.text[newborn_span.start ..],
- newborn_span.len(),
- ),
- ReparseMode::MarkupElements { at_start, min_indent } => reparse_markup_elements(
- &prefix,
- &change.text[newborn_span.start ..],
- newborn_span.len(),
- differential,
- &node.children().as_slice()[superseded_start ..],
- at_start,
- min_indent,
- ),
- }?;
-
- // Do not accept unclosed nodes if the old node wasn't at the right edge
- // of the tree.
- if !outermost && !terminated {
- return None;
- }
-
- node.replace_children(superseded_start .. superseded_start + amount, newborns)
- .ok()?;
-
- Some(newborn_span)
-}
-
-/// A description of a change.
-struct Change<'a> {
- /// The new source code, with the change applied.
- text: &'a str,
- /// Which range in the old source file was changed.
- replaced: Range<usize>,
- /// How many characters replaced the text in `replaced`.
- replacement_len: usize,
-}
-
-/// Encodes the state machine of the search for the nodes are pending for
-/// replacement.
-#[derive(Clone, Copy, Debug, PartialEq)]
-enum SearchState {
- /// Neither an end nor a start have been found as of now.
- /// The latest non-trivia child is continually saved.
- NoneFound,
- /// The search has concluded by finding a node that fully contains the
- /// modifications.
- Contained(NodePos),
- /// The search has found the start of the modified nodes.
- Inside(NodePos),
- /// The search has found the end of the modified nodes but the change
- /// touched its boundries so another non-trivia node is needed.
- RequireNonTrivia(NodePos),
- /// The search has concluded by finding a start and an end index for nodes
- /// with a pending reparse.
- SpanFound(NodePos, NodePos),
-}
-
-impl Default for SearchState {
- fn default() -> Self {
- Self::NoneFound
- }
-}
-
-impl SearchState {
- fn done(self) -> Option<(NodePos, NodePos)> {
- match self {
- Self::NoneFound => None,
- Self::Contained(s) => Some((s, s)),
- Self::Inside(_) => None,
- Self::RequireNonTrivia(_) => None,
- Self::SpanFound(s, e) => Some((s, e)),
- }
- }
-}
-
-/// The position of a syntax node.
-#[derive(Clone, Copy, Debug, PartialEq)]
-struct NodePos {
- /// The index in the parent node.
- idx: usize,
- /// The byte offset in the string.
- offset: usize,
-}
-
-/// An ahead node with an index and whether it is `at_start`.
-#[derive(Clone, Copy, Debug, PartialEq)]
-struct Ahead {
- /// The position of the node.
- pos: NodePos,
- /// The `at_start` before this node.
- at_start: bool,
- /// The kind of ahead node.
- kind: AheadKind,
-}
-
-/// The kind of ahead node.
-#[derive(Clone, Copy, Debug, PartialEq)]
-enum AheadKind {
- /// A normal non-trivia child has been found.
- Normal,
- /// An unbounded child has been found. The boolean indicates whether it was
- /// on the current line, in which case adding it to the reparsing range is
- /// compulsory.
- Unbounded(bool),
-}
-
-impl Ahead {
- fn new(pos: NodePos, at_start: bool, bounded: bool) -> Self {
- Self {
- pos,
- at_start,
- kind: if bounded {
- AheadKind::Normal
- } else {
- AheadKind::Unbounded(true)
- },
- }
- }
-
- fn newline(&mut self) {
- if let AheadKind::Unbounded(current_line) = &mut self.kind {
- *current_line = false;
- }
- }
-
- fn is_compulsory(self) -> bool {
- matches!(self.kind, AheadKind::Unbounded(true))
- }
-}
-
-/// Which reparse function to choose for a span of elements.
-#[derive(Clone, Copy, Debug, PartialEq)]
-enum ReparseMode {
- /// Reparse a code block, including its braces.
- Code,
- /// Reparse a content block, including its square brackets.
- Content,
- /// Reparse elements of the markup. Also specified the initial `at_start`
- /// state for the reparse and the minimum indent of the reparsed nodes.
- MarkupElements { at_start: bool, min_indent: usize },
-}
-
-/// Whether changes _inside_ this node are safely encapsulated, so that only
-/// this node must be reparsed.
-fn is_bounded(kind: &NodeKind) -> bool {
- match kind {
- NodeKind::CodeBlock
- | NodeKind::ContentBlock
- | NodeKind::Linebreak
- | NodeKind::SmartQuote { .. }
- | NodeKind::BlockComment
- | NodeKind::Space { .. }
- | NodeKind::Escape(_)
- | NodeKind::Shorthand(_) => true,
- _ => false,
- }
-}
-
-/// Whether `at_start` would still be true after this node given the
-/// previous value of the property.
-fn next_at_start(kind: &NodeKind, prev: bool) -> bool {
- match kind {
- NodeKind::Space { newlines: (1 ..) } => true,
- NodeKind::Space { .. } | NodeKind::LineComment | NodeKind::BlockComment => prev,
- _ => false,
- }
-}
-
-#[cfg(test)]
-#[rustfmt::skip]
-mod tests {
- use super::*;
- use crate::parse::parse;
- use crate::parse::tests::check;
- use crate::source::Source;
-
- #[track_caller]
- fn test(prev: &str, range: Range<usize>, with: &str, goal: Range<usize>) {
- let mut source = Source::detached(prev);
- let range = source.edit(range, with);
- check(source.text(), source.root(), &parse(source.text()));
- assert_eq!(range, goal);
- }
-
- #[test]
- fn test_parse_incremental_simple_replacements() {
- test("hello world", 7 .. 12, "walkers", 0 .. 14);
- test("some content", 0..12, "", 0..0);
- test("", 0..0, "do it", 0..5);
- test("a d e", 1 .. 3, " b c d", 0 .. 9);
- test("*~ *", 2..2, "*", 0..5);
- test("_1_\n2a\n3", 5..5, "4", 4..7);
- test("_1_\n2a\n3~", 8..8, "4", 4..10);
- test("_1_ 2 3a\n4", 7..7, "5", 0..9);
- test("* {1+2} *", 5..6, "3", 2..7);
- test("a #f() e", 1 .. 6, " b c d", 0 .. 9);
- test("a\nb\nc\nd\ne\n", 5 .. 5, "c", 2 .. 7);
- test("a\n\nb\n\nc\n\nd\n\ne\n", 7 .. 7, "c", 3 .. 10);
- test("a\nb\nc *hel a b lo* d\nd\ne", 13..13, "c ", 4..20);
- test("~~ {a} ~~", 4 .. 5, "b", 3 .. 6);
- test("{(0, 1, 2)}", 5 .. 6, "11pt", 0..14);
- test("\n= A heading", 4 .. 4, "n evocative", 0 .. 23);
- test("for~your~thing", 9 .. 9, "a", 0 .. 15);
- test("a your thing a", 6 .. 7, "a", 0 .. 14);
- test("{call(); abc}", 7 .. 7, "[]", 0 .. 15);
- test("#call() abc", 7 .. 7, "[]", 0 .. 10);
- test("hi[\n- item\n- item 2\n - item 3]", 11 .. 11, " ", 2 .. 35);
- test("hi\n- item\nno item\n - item 3", 10 .. 10, "- ", 3..19);
- test("#grid(columns: (auto, 1fr, 40%), [*plonk*], rect(width: 100%, height: 1pt, fill: conifer), [thing])", 16 .. 20, "none", 0..99);
- test("#grid(columns: (auto, 1fr, 40%), [*plonk*], rect(width: 100%, height: 1pt, fill: conifer), [thing])", 33 .. 42, "[_gronk_]", 33..42);
- test("#grid(columns: (auto, 1fr, 40%), [*plonk*], rect(width: 100%, height: 1pt, fill: conifer), [thing])", 34 .. 41, "_bar_", 33 .. 40);
- test("{let i=1; for x in range(5) {i}}", 6 .. 6, " ", 0 .. 33);
- test("{let i=1; for x in range(5) {i}}", 13 .. 14, " ", 0 .. 33);
- test("hello~~{x}", 7 .. 10, "#f()", 0 .. 11);
- test("this~is -- in my opinion -- spectacular", 8 .. 10, "---", 0 .. 25);
- test("understanding `code` is complicated", 15 .. 15, "C ", 0 .. 22);
- test("{ let x = g() }", 10 .. 12, "f(54", 0 .. 17);
- test(r#"a ```typst hello``` b"#, 16 .. 17, "", 0 .. 18);
- test(r#"a ```typst hello```"#, 16 .. 17, "", 0 .. 18);
- test("#for", 4 .. 4, "//", 0 .. 6);
- test("#show a: f as b..", 16..16, "c", 0..18);
- test("a\n#let \nb", 7 .. 7, "i", 2 .. 9);
- test("a\n#for i \nb", 9 .. 9, "in", 2 .. 12);
- test("a~https://fun/html", 13..14, "n", 0..18);
- }
-
- #[test]
- fn test_parse_incremental_whitespace_invariants() {
- test("hello \\ world", 7 .. 8, "a ", 0 .. 14);
- test("hello \\ world", 7 .. 8, " a", 0 .. 14);
- test("x = y", 1 .. 1, " + y", 0 .. 6);
- test("x = y", 1 .. 1, " + y\n", 0 .. 7);
- test("abc\n= a heading\njoke", 3 .. 4, "\nmore\n\n", 0 .. 21);
- test("abc\n= a heading\njoke", 3 .. 4, "\nnot ", 0 .. 19);
- test("#let x = (1, 2 + ;~ Five\r\n\r", 20 .. 23, "2.", 0 .. 23);
- test("hey #myfriend", 4 .. 4, "\\", 0 .. 14);
- test("hey #myfriend", 4 .. 4, "\\", 0 .. 6);
- test("= foo\nbar\n - a\n - b", 6 .. 9, "", 0 .. 11);
- test("= foo\n bar\n baz", 6 .. 8, "", 0 .. 9);
- test(" // hi", 1 .. 1, " ", 0 .. 7);
- test("- \nA", 2..3, "", 0..3);
- }
-
- #[test]
- fn test_parse_incremental_type_invariants() {
- test("a #for x in array {x}", 18 .. 21, "[#x]", 0 .. 22);
- test("a #let x = 1 {5}", 3 .. 6, "if", 0 .. 11);
- test("a {let x = 1 {5}} b", 3 .. 6, "if", 2 .. 16);
- test("#let x = 1 {5}", 4 .. 4, " if", 0 .. 13);
- test("{let x = 1 {5}}", 4 .. 4, " if", 0 .. 18);
- test("a // b c #f()", 3 .. 4, "", 0 .. 12);
- test("{\nf()\n//g(a)\n}", 6 .. 8, "", 0 .. 12);
- test("a{\nf()\n//g(a)\n}b", 7 .. 9, "", 1 .. 13);
- test("a #while x {\n g(x) \n} b", 11 .. 11, "//", 0 .. 26);
- test("{(1, 2)}", 1 .. 1, "while ", 0 .. 14);
- test("a b c", 1 .. 1, "{[}", 0 .. 8);
- }
-
- #[test]
- fn test_parse_incremental_wrongly_or_unclosed_things() {
- test(r#"{"hi"}"#, 4 .. 5, "c", 0 .. 6);
- test(r"this \u{abcd}", 8 .. 9, "", 0 .. 12);
- test(r"this \u{abcd} that", 12 .. 13, "", 0 .. 17);
- test(r"{{let x = z}; a = 1} b", 6 .. 6, "//", 0 .. 24);
- test("a b c", 1 .. 1, " /* letters */", 0 .. 19);
- test("a b c", 1 .. 1, " /* letters", 0 .. 16);
- test("{if i==1 {a} else [b]; b()}", 12 .. 12, " /* letters */", 0 .. 41);
- test("{if i==1 {a} else [b]; b()}", 12 .. 12, " /* letters", 0 .. 38);
- test("~~~~", 2 .. 2, "[]", 0 .. 5);
- test("a[]b", 2 .. 2, "{", 1 .. 4);
- test("[hello]", 2 .. 3, "]", 0 .. 7);
- test("{a}", 1 .. 2, "b", 0 .. 3);
- test("{ a; b; c }", 5 .. 6, "[}]", 0 .. 13);
- test("#a()\n~", 3..4, "{}", 0..7);
- test("[]\n~", 1..2, "#if i==0 {true}", 0..18);
- }
-}
diff --git a/src/parse/mod.rs b/src/parse/mod.rs
deleted file mode 100644
index ac8ec6eb..00000000
--- a/src/parse/mod.rs
+++ /dev/null
@@ -1,1166 +0,0 @@
-//! Parsing and tokenization.
-
-mod incremental;
-mod parser;
-mod resolve;
-mod tokens;
-
-pub use incremental::*;
-pub use parser::*;
-pub use tokens::*;
-
-use std::collections::HashSet;
-
-use crate::syntax::ast::{Assoc, BinOp, UnOp};
-use crate::syntax::{ErrorPos, NodeKind, SyntaxNode};
-use crate::util::EcoString;
-
-/// Parse a source file.
-pub fn parse(text: &str) -> SyntaxNode {
- let mut p = Parser::new(text, TokenMode::Markup);
- markup(&mut p, true);
- p.finish().into_iter().next().unwrap()
-}
-
-/// Parse code directly, only used for syntax highlighting.
-pub fn parse_code(text: &str) -> SyntaxNode {
- let mut p = Parser::new(text, TokenMode::Code);
- p.perform(NodeKind::CodeBlock, code);
- p.finish().into_iter().next().unwrap()
-}
-
-/// Reparse a code block.
-///
-/// Returns `Some` if all of the input was consumed.
-fn reparse_code_block(
- prefix: &str,
- text: &str,
- end_pos: usize,
-) -> Option<(Vec<SyntaxNode>, bool, usize)> {
- let mut p = Parser::with_prefix(prefix, text, TokenMode::Code);
- if !p.at(NodeKind::LeftBrace) {
- return None;
- }
-
- code_block(&mut p);
-
- let (mut node, terminated) = p.consume()?;
- let first = node.remove(0);
- if first.len() != end_pos {
- return None;
- }
-
- Some((vec![first], terminated, 1))
-}
-
-/// Reparse a content block.
-///
-/// Returns `Some` if all of the input was consumed.
-fn reparse_content_block(
- prefix: &str,
- text: &str,
- end_pos: usize,
-) -> Option<(Vec<SyntaxNode>, bool, usize)> {
- let mut p = Parser::with_prefix(prefix, text, TokenMode::Code);
- if !p.at(NodeKind::LeftBracket) {
- return None;
- }
-
- content_block(&mut p);
-
- let (mut node, terminated) = p.consume()?;
- let first = node.remove(0);
- if first.len() != end_pos {
- return None;
- }
-
- Some((vec![first], terminated, 1))
-}
-
-/// Reparse a sequence markup elements without the topmost node.
-///
-/// Returns `Some` if all of the input was consumed.
-fn reparse_markup_elements(
- prefix: &str,
- text: &str,
- end_pos: usize,
- differential: isize,
- reference: &[SyntaxNode],
- mut at_start: bool,
- min_indent: usize,
-) -> Option<(Vec<SyntaxNode>, bool, usize)> {
- let mut p = Parser::with_prefix(prefix, text, TokenMode::Markup);
-
- let mut node: Option<&SyntaxNode> = None;
- let mut iter = reference.iter();
- let mut offset = differential;
- let mut replaced = 0;
- let mut stopped = false;
-
- 'outer: while !p.eof() {
- if let Some(NodeKind::Space { newlines: (1 ..) }) = p.peek() {
- if p.column(p.current_end()) < min_indent {
- return None;
- }
- }
-
- markup_node(&mut p, &mut at_start);
-
- if p.prev_end() <= end_pos {
- continue;
- }
-
- let recent = p.marker().before(&p).unwrap();
- let recent_start = p.prev_end() - recent.len();
-
- while offset <= recent_start as isize {
- if let Some(node) = node {
- // The nodes are equal, at the same position and have the
- // same content. The parsing trees have converged again, so
- // the reparse may stop here.
- if offset == recent_start as isize && node == recent {
- replaced -= 1;
- stopped = true;
- break 'outer;
- }
- }
-
- if let Some(node) = node {
- offset += node.len() as isize;
- }
-
- node = iter.next();
- if node.is_none() {
- break;
- }
-
- replaced += 1;
- }
- }
-
- if p.eof() && !stopped {
- replaced = reference.len();
- }
-
- let (mut res, terminated) = p.consume()?;
- if stopped {
- res.pop().unwrap();
- }
-
- Some((res, terminated, replaced))
-}
-
-/// Parse markup.
-///
-/// If `at_start` is true, things like headings that may only appear at the
-/// beginning of a line or content block are initially allowed.
-fn markup(p: &mut Parser, mut at_start: bool) {
- p.perform(NodeKind::Markup { min_indent: 0 }, |p| {
- while !p.eof() {
- markup_node(p, &mut at_start);
- }
- });
-}
-
-/// Parse markup that stays right of the given `column`.
-fn markup_indented(p: &mut Parser, min_indent: usize) {
- p.eat_while(|t| match t {
- NodeKind::Space { newlines } => *newlines == 0,
- NodeKind::LineComment | NodeKind::BlockComment => true,
- _ => false,
- });
-
- let marker = p.marker();
- let mut at_start = false;
-
- while !p.eof() {
- match p.peek() {
- Some(NodeKind::Space { newlines: (1 ..) })
- if p.column(p.current_end()) < min_indent =>
- {
- break;
- }
- _ => {}
- }
-
- markup_node(p, &mut at_start);
- }
-
- marker.end(p, NodeKind::Markup { min_indent });
-}
-
-/// Parse a line of markup that can prematurely end if `f` returns true.
-fn markup_line<F>(p: &mut Parser, mut f: F)
-where
- F: FnMut(&NodeKind) -> bool,
-{
- p.eat_while(|t| match t {
- NodeKind::Space { newlines } => *newlines == 0,
- NodeKind::LineComment | NodeKind::BlockComment => true,
- _ => false,
- });
-
- p.perform(NodeKind::Markup { min_indent: usize::MAX }, |p| {
- let mut at_start = false;
- while let Some(kind) = p.peek() {
- if let NodeKind::Space { newlines: (1 ..) } = kind {
- break;
- }
-
- if f(kind) {
- break;
- }
-
- markup_node(p, &mut at_start);
- }
- });
-}
-
-/// Parse a markup node.
-fn markup_node(p: &mut Parser, at_start: &mut bool) {
- let token = match p.peek() {
- Some(t) => t,
- None => return,
- };
-
- match token {
- // Whitespace.
- NodeKind::Space { newlines } => {
- *at_start |= *newlines > 0;
- p.eat();
- return;
- }
-
- // Comments.
- NodeKind::LineComment | NodeKind::BlockComment => {
- p.eat();
- return;
- }
-
- // Text and markup.
- NodeKind::Text(_)
- | NodeKind::Linebreak
- | NodeKind::SmartQuote { .. }
- | NodeKind::Escape(_)
- | NodeKind::Shorthand(_)
- | NodeKind::Link(_)
- | NodeKind::Raw(_)
- | NodeKind::Label(_)
- | NodeKind::Ref(_) => p.eat(),
-
- // Math.
- NodeKind::Dollar => math(p),
-
- // Strong, emph, heading.
- NodeKind::Star => strong(p),
- NodeKind::Underscore => emph(p),
- NodeKind::Eq => heading(p, *at_start),
-
- // Lists.
- NodeKind::Minus => list_node(p, *at_start),
- NodeKind::Plus | NodeKind::EnumNumbering(_) => enum_node(p, *at_start),
- NodeKind::Slash => {
- desc_node(p, *at_start).ok();
- }
- NodeKind::Colon => {
- let marker = p.marker();
- p.eat();
- marker.convert(p, NodeKind::Text(':'.into()));
- }
-
- // Hashtag + keyword / identifier.
- NodeKind::Ident(_)
- | NodeKind::Let
- | NodeKind::Set
- | NodeKind::Show
- | NodeKind::Wrap
- | NodeKind::If
- | NodeKind::While
- | NodeKind::For
- | NodeKind::Import
- | NodeKind::Include
- | NodeKind::Break
- | NodeKind::Continue
- | NodeKind::Return => markup_expr(p),
-
- // Code and content block.
- NodeKind::LeftBrace => code_block(p),
- NodeKind::LeftBracket => content_block(p),
-
- NodeKind::Error(_, _) => p.eat(),
- _ => p.unexpected(),
- };
-
- *at_start = false;
-}
-
-/// Parse strong content.
-fn strong(p: &mut Parser) {
- p.perform(NodeKind::Strong, |p| {
- p.start_group(Group::Strong);
- markup(p, false);
- p.end_group();
- })
-}
-
-/// Parse emphasized content.
-fn emph(p: &mut Parser) {
- p.perform(NodeKind::Emph, |p| {
- p.start_group(Group::Emph);
- markup(p, false);
- p.end_group();
- })
-}
-
-/// Parse a heading.
-fn heading(p: &mut Parser, at_start: bool) {
- let marker = p.marker();
- let current_start = p.current_start();
- p.assert(NodeKind::Eq);
- while p.eat_if(NodeKind::Eq) {}
-
- if at_start && p.peek().map_or(true, |kind| kind.is_space()) {
- p.eat_while(|kind| *kind == NodeKind::Space { newlines: 0 });
- markup_line(p, |kind| matches!(kind, NodeKind::Label(_)));
- marker.end(p, NodeKind::Heading);
- } else {
- let text = p.get(current_start .. p.prev_end()).into();
- marker.convert(p, NodeKind::Text(text));
- }
-}
-
-/// Parse a single list item.
-fn list_node(p: &mut Parser, at_start: bool) {
- let marker = p.marker();
- let text: EcoString = p.peek_src().into();
- p.assert(NodeKind::Minus);
-
- let min_indent = p.column(p.prev_end());
- if at_start && p.eat_if(NodeKind::Space { newlines: 0 }) && !p.eof() {
- markup_indented(p, min_indent);
- marker.end(p, NodeKind::ListItem);
- } else {
- marker.convert(p, NodeKind::Text(text));
- }
-}
-
-/// Parse a single enum item.
-fn enum_node(p: &mut Parser, at_start: bool) {
- let marker = p.marker();
- let text: EcoString = p.peek_src().into();
- p.eat();
-
- let min_indent = p.column(p.prev_end());
- if at_start && p.eat_if(NodeKind::Space { newlines: 0 }) && !p.eof() {
- markup_indented(p, min_indent);
- marker.end(p, NodeKind::EnumItem);
- } else {
- marker.convert(p, NodeKind::Text(text));
- }
-}
-
-/// Parse a single description list item.
-fn desc_node(p: &mut Parser, at_start: bool) -> ParseResult {
- let marker = p.marker();
- let text: EcoString = p.peek_src().into();
- p.eat();
-
- let min_indent = p.column(p.prev_end());
- if at_start && p.eat_if(NodeKind::Space { newlines: 0 }) && !p.eof() {
- markup_line(p, |node| matches!(node, NodeKind::Colon));
- p.expect(NodeKind::Colon)?;
- markup_indented(p, min_indent);
- marker.end(p, NodeKind::DescItem);
- } else {
- marker.convert(p, NodeKind::Text(text));
- }
-
- Ok(())
-}
-
-/// Parse an expression within a markup mode.
-fn markup_expr(p: &mut Parser) {
- // Does the expression need termination or can content follow directly?
- let stmt = matches!(
- p.peek(),
- Some(
- NodeKind::Let
- | NodeKind::Set
- | NodeKind::Show
- | NodeKind::Wrap
- | NodeKind::Import
- | NodeKind::Include
- )
- );
-
- p.start_group(Group::Expr);
- let res = expr_prec(p, true, 0);
- if stmt && res.is_ok() && !p.eof() {
- p.expected("semicolon or line break");
- }
- p.end_group();
-}
-
-/// Parse math.
-fn math(p: &mut Parser) {
- p.perform(NodeKind::Math, |p| {
- p.start_group(Group::Math);
- while !p.eof() {
- math_node(p);
- }
- p.end_group();
- });
-}
-
-/// Parse a math node.
-fn math_node(p: &mut Parser) {
- math_node_prec(p, 0, None)
-}
-
-/// Parse a math node with operators having at least the minimum precedence.
-fn math_node_prec(p: &mut Parser, min_prec: usize, stop: Option<NodeKind>) {
- let marker = p.marker();
- math_primary(p);
-
- loop {
- let (kind, mut prec, assoc, stop) = match p.peek() {
- v if v == stop.as_ref() => break,
- Some(NodeKind::Underscore) => {
- (NodeKind::Script, 2, Assoc::Right, Some(NodeKind::Hat))
- }
- Some(NodeKind::Hat) => (
- NodeKind::Script,
- 2,
- Assoc::Right,
- Some(NodeKind::Underscore),
- ),
- Some(NodeKind::Slash) => (NodeKind::Frac, 1, Assoc::Left, None),
- _ => break,
- };
-
- if prec < min_prec {
- break;
- }
-
- match assoc {
- Assoc::Left => prec += 1,
- Assoc::Right => {}
- }
-
- p.eat();
- math_node_prec(p, prec, stop);
-
- // Allow up to two different scripts. We do not risk encountering the
- // previous script kind again here due to right-associativity.
- if p.eat_if(NodeKind::Underscore) || p.eat_if(NodeKind::Hat) {
- math_node_prec(p, prec, None);
- }
-
- marker.end(p, kind);
- }
-}
-
-/// Parse a primary math node.
-fn math_primary(p: &mut Parser) {
- let token = match p.peek() {
- Some(t) => t,
- None => return,
- };
-
- match token {
- // Spaces, atoms and expressions.
- NodeKind::Space { .. }
- | NodeKind::Linebreak
- | NodeKind::Escape(_)
- | NodeKind::Atom(_)
- | NodeKind::Ident(_) => p.eat(),
-
- // Groups.
- NodeKind::LeftParen => group(p, Group::Paren, '(', ')'),
- NodeKind::LeftBracket => group(p, Group::Bracket, '[', ']'),
- NodeKind::LeftBrace => group(p, Group::Brace, '{', '}'),
-
- // Alignment indactor.
- NodeKind::Amp => align(p),
-
- _ => p.unexpected(),
- }
-}
-
-/// Parse grouped math.
-fn group(p: &mut Parser, group: Group, l: char, r: char) {
- p.perform(NodeKind::Math, |p| {
- let marker = p.marker();
- p.start_group(group);
- marker.convert(p, NodeKind::Atom(l.into()));
- while !p.eof() {
- math_node(p);
- }
- let marker = p.marker();
- p.end_group();
- marker.convert(p, NodeKind::Atom(r.into()));
- })
-}
-
-/// Parse an alignment indicator.
-fn align(p: &mut Parser) {
- p.perform(NodeKind::Align, |p| {
- p.assert(NodeKind::Amp);
- while p.eat_if(NodeKind::Amp) {}
- })
-}
-
-/// Parse an expression.
-fn expr(p: &mut Parser) -> ParseResult {
- expr_prec(p, false, 0)
-}
-
-/// Parse an expression with operators having at least the minimum precedence.
-///
-/// If `atomic` is true, this does not parse binary operations and arrow
-/// functions, which is exactly what we want in a shorthand expression directly
-/// in markup.
-///
-/// Stops parsing at operations with lower precedence than `min_prec`,
-fn expr_prec(p: &mut Parser, atomic: bool, min_prec: usize) -> ParseResult {
- let marker = p.marker();
-
- // Start the unary expression.
- match p.peek().and_then(UnOp::from_token) {
- Some(op) if !atomic => {
- p.eat();
- let prec = op.precedence();
- expr_prec(p, atomic, prec)?;
- marker.end(p, NodeKind::Unary);
- }
- _ => primary(p, atomic)?,
- };
-
- loop {
- // Parenthesis or bracket means this is a function call.
- if let Some(NodeKind::LeftParen | NodeKind::LeftBracket) = p.peek_direct() {
- marker.perform(p, NodeKind::FuncCall, args)?;
- continue;
- }
-
- if atomic {
- break;
- }
-
- // Method call or field access.
- if p.eat_if(NodeKind::Dot) {
- ident(p)?;
- if let Some(NodeKind::LeftParen | NodeKind::LeftBracket) = p.peek_direct() {
- marker.perform(p, NodeKind::MethodCall, args)?;
- } else {
- marker.end(p, NodeKind::FieldAccess);
- }
- continue;
- }
-
- let op = if p.eat_if(NodeKind::Not) {
- if p.at(NodeKind::In) {
- BinOp::NotIn
- } else {
- p.expected("keyword `in`");
- return Err(ParseError);
- }
- } else {
- match p.peek().and_then(BinOp::from_token) {
- Some(binop) => binop,
- None => break,
- }
- };
-
- let mut prec = op.precedence();
- if prec < min_prec {
- break;
- }
-
- p.eat();
-
- match op.assoc() {
- Assoc::Left => prec += 1,
- Assoc::Right => {}
- }
-
- marker.perform(p, NodeKind::Binary, |p| expr_prec(p, atomic, prec))?;
- }
-
- Ok(())
-}
-
-/// Parse a primary expression.
-fn primary(p: &mut Parser, atomic: bool) -> ParseResult {
- if literal(p) {
- return Ok(());
- }
-
- match p.peek() {
- // Things that start with an identifier.
- Some(NodeKind::Ident(_)) => {
- let marker = p.marker();
- p.eat();
-
- // Arrow means this is a closure's lone parameter.
- if !atomic && p.at(NodeKind::Arrow) {
- marker.end(p, NodeKind::Params);
- p.assert(NodeKind::Arrow);
- marker.perform(p, NodeKind::Closure, expr)
- } else {
- Ok(())
- }
- }
-
- // Structures.
- Some(NodeKind::LeftParen) => parenthesized(p, atomic),
- Some(NodeKind::LeftBrace) => Ok(code_block(p)),
- Some(NodeKind::LeftBracket) => Ok(content_block(p)),
-
- // Keywords.
- Some(NodeKind::Let) => let_expr(p),
- Some(NodeKind::Set) => set_expr(p),
- Some(NodeKind::Show) => show_expr(p),
- Some(NodeKind::Wrap) => wrap_expr(p),
- Some(NodeKind::If) => if_expr(p),
- Some(NodeKind::While) => while_expr(p),
- Some(NodeKind::For) => for_expr(p),
- Some(NodeKind::Import) => import_expr(p),
- Some(NodeKind::Include) => include_expr(p),
- Some(NodeKind::Break) => break_expr(p),
- Some(NodeKind::Continue) => continue_expr(p),
- Some(NodeKind::Return) => return_expr(p),
-
- Some(NodeKind::Error(_, _)) => {
- p.eat();
- Err(ParseError)
- }
-
- // Nothing.
- _ => {
- p.expected_found("expression");
- Err(ParseError)
- }
- }
-}
-
-/// Parse a literal.
-fn literal(p: &mut Parser) -> bool {
- match p.peek() {
- // Basic values.
- Some(
- NodeKind::None
- | NodeKind::Auto
- | NodeKind::Int(_)
- | NodeKind::Float(_)
- | NodeKind::Bool(_)
- | NodeKind::Numeric(_, _)
- | NodeKind::Str(_),
- ) => {
- p.eat();
- true
- }
-
- _ => false,
- }
-}
-
-/// Parse an identifier.
-fn ident(p: &mut Parser) -> ParseResult {
- match p.peek() {
- Some(NodeKind::Ident(_)) => {
- p.eat();
- Ok(())
- }
- _ => {
- p.expected_found("identifier");
- Err(ParseError)
- }
- }
-}
-
-/// Parse something that starts with a parenthesis, which can be either of:
-/// - Array literal
-/// - Dictionary literal
-/// - Parenthesized expression
-/// - Parameter list of closure expression
-fn parenthesized(p: &mut Parser, atomic: bool) -> ParseResult {
- let marker = p.marker();
-
- p.start_group(Group::Paren);
- let colon = p.eat_if(NodeKind::Colon);
- let kind = collection(p, true).0;
- p.end_group();
-
- // Leading colon makes this a dictionary.
- if colon {
- dict(p, marker);
- return Ok(());
- }
-
- // Arrow means this is a closure's parameter list.
- if !atomic && p.at(NodeKind::Arrow) {
- params(p, marker);
- p.assert(NodeKind::Arrow);
- return marker.perform(p, NodeKind::Closure, expr);
- }
-
- // Transform into the identified collection.
- match kind {
- CollectionKind::Group => marker.end(p, NodeKind::Parenthesized),
- CollectionKind::Positional => array(p, marker),
- CollectionKind::Named => dict(p, marker),
- }
-
- Ok(())
-}
-
-/// The type of a collection.
-#[derive(Debug, Copy, Clone, Eq, PartialEq)]
-enum CollectionKind {
- /// The collection is only one item and has no comma.
- Group,
- /// The collection starts with a positional item and has multiple items or a
- /// trailing comma.
- Positional,
- /// The collection starts with a colon or named item.
- Named,
-}
-
-/// Parse a collection.
-///
-/// Returns the length of the collection and whether the literal contained any
-/// commas.
-fn collection(p: &mut Parser, keyed: bool) -> (CollectionKind, usize) {
- let mut kind = None;
- let mut items = 0;
- let mut can_group = true;
- let mut missing_coma: Option<Marker> = None;
-
- while !p.eof() {
- if let Ok(item_kind) = item(p, keyed) {
- match item_kind {
- NodeKind::Spread => can_group = false,
- NodeKind::Named if kind.is_none() => {
- kind = Some(CollectionKind::Named);
- can_group = false;
- }
- _ if kind.is_none() => {
- kind = Some(CollectionKind::Positional);
- }
- _ => {}
- }
-
- items += 1;
-
- if let Some(marker) = missing_coma.take() {
- p.expected_at(marker, "comma");
- }
-
- if p.eof() {
- break;
- }
-
- if p.eat_if(NodeKind::Comma) {
- can_group = false;
- } else {
- missing_coma = Some(p.trivia_start());
- }
- } else {
- p.eat_if(NodeKind::Comma);
- kind = Some(CollectionKind::Group);
- }
- }
-
- let kind = if can_group && items == 1 {
- CollectionKind::Group
- } else {
- kind.unwrap_or(CollectionKind::Positional)
- };
-
- (kind, items)
-}
-
-/// Parse an expression or a named pair, returning whether it's a spread or a
-/// named pair.
-fn item(p: &mut Parser, keyed: bool) -> ParseResult<NodeKind> {
- let marker = p.marker();
- if p.eat_if(NodeKind::Dots) {
- marker.perform(p, NodeKind::Spread, expr)?;
- return Ok(NodeKind::Spread);
- }
-
- expr(p)?;
-
- if p.at(NodeKind::Colon) {
- match marker.after(p).map(|c| c.kind()) {
- Some(NodeKind::Ident(_)) => {
- p.eat();
- marker.perform(p, NodeKind::Named, expr)?;
- }
- Some(NodeKind::Str(_)) if keyed => {
- p.eat();
- marker.perform(p, NodeKind::Keyed, expr)?;
- }
- kind => {
- let mut msg = EcoString::from("expected identifier");
- if keyed {
- msg.push_str(" or string");
- }
- if let Some(kind) = kind {
- msg.push_str(", found ");
- msg.push_str(kind.name());
- }
- let error = NodeKind::Error(ErrorPos::Full, msg);
- marker.end(p, error);
- p.eat();
- marker.perform(p, NodeKind::Named, expr).ok();
- return Err(ParseError);
- }
- }
-
- Ok(NodeKind::Named)
- } else {
- Ok(NodeKind::None)
- }
-}
-
-/// Convert a collection into an array, producing errors for anything other than
-/// expressions.
-fn array(p: &mut Parser, marker: Marker) {
- marker.filter_children(p, |x| match x.kind() {
- NodeKind::Named | NodeKind::Keyed => Err("expected expression"),
- _ => Ok(()),
- });
- marker.end(p, NodeKind::Array);
-}
-
-/// Convert a collection into a dictionary, producing errors for anything other
-/// than named and keyed pairs.
-fn dict(p: &mut Parser, marker: Marker) {
- let mut used = HashSet::new();
- marker.filter_children(p, |x| match x.kind() {
- kind if kind.is_paren() => Ok(()),
- NodeKind::Named | NodeKind::Keyed => {
- if let Some(NodeKind::Ident(key) | NodeKind::Str(key)) =
- x.children().next().map(|child| child.kind())
- {
- if !used.insert(key.clone()) {
- return Err("pair has duplicate key");
- }
- }
- Ok(())
- }
- NodeKind::Spread | NodeKind::Comma | NodeKind::Colon => Ok(()),
- _ => Err("expected named or keyed pair"),
- });
- marker.end(p, NodeKind::Dict);
-}
-
-/// Convert a collection into a list of parameters, producing errors for
-/// anything other than identifiers, spread operations and named pairs.
-fn params(p: &mut Parser, marker: Marker) {
- marker.filter_children(p, |x| match x.kind() {
- kind if kind.is_paren() => Ok(()),
- NodeKind::Named | NodeKind::Ident(_) | NodeKind::Comma => Ok(()),
- NodeKind::Spread
- if matches!(
- x.children().last().map(|child| child.kind()),
- Some(&NodeKind::Ident(_))
- ) =>
- {
- Ok(())
- }
- _ => Err("expected identifier, named pair or argument sink"),
- });
- marker.end(p, NodeKind::Params);
-}
-
-/// Parse a code block: `{...}`.
-fn code_block(p: &mut Parser) {
- p.perform(NodeKind::CodeBlock, |p| {
- p.start_group(Group::Brace);
- code(p);
- p.end_group();
- });
-}
-
-/// Parse expressions.
-fn code(p: &mut Parser) {
- while !p.eof() {
- p.start_group(Group::Expr);
- if expr(p).is_ok() && !p.eof() {
- p.expected("semicolon or line break");
- }
- p.end_group();
-
- // Forcefully skip over newlines since the group's contents can't.
- p.eat_while(NodeKind::is_space);
- }
-}
-
-/// Parse a content block: `[...]`.
-fn content_block(p: &mut Parser) {
- p.perform(NodeKind::ContentBlock, |p| {
- p.start_group(Group::Bracket);
- markup(p, true);
- p.end_group();
- });
-}
-
-/// Parse the arguments to a function call.
-fn args(p: &mut Parser) -> ParseResult {
- match p.peek_direct() {
- Some(NodeKind::LeftParen) => {}
- Some(NodeKind::LeftBracket) => {}
- _ => {
- p.expected_found("argument list");
- return Err(ParseError);
- }
- }
-
- p.perform(NodeKind::Args, |p| {
- if p.at(NodeKind::LeftParen) {
- let marker = p.marker();
- p.start_group(Group::Paren);
- collection(p, false);
- p.end_group();
-
- let mut used = HashSet::new();
- marker.filter_children(p, |x| match x.kind() {
- NodeKind::Named => {
- if let Some(NodeKind::Ident(ident)) =
- x.children().next().map(|child| child.kind())
- {
- if !used.insert(ident.clone()) {
- return Err("duplicate argument");
- }
- }
- Ok(())
- }
- _ => Ok(()),
- });
- }
-
- while p.peek_direct() == Some(&NodeKind::LeftBracket) {
- content_block(p);
- }
- });
-
- Ok(())
-}
-
-/// Parse a let expression.
-fn let_expr(p: &mut Parser) -> ParseResult {
- p.perform(NodeKind::LetBinding, |p| {
- p.assert(NodeKind::Let);
-
- let marker = p.marker();
- ident(p)?;
-
- // If a parenthesis follows, this is a function definition.
- let has_params = p.peek_direct() == Some(&NodeKind::LeftParen);
- if has_params {
- let marker = p.marker();
- p.start_group(Group::Paren);
- collection(p, false);
- p.end_group();
- params(p, marker);
- }
-
- if p.eat_if(NodeKind::Eq) {
- expr(p)?;
- } else if has_params {
- // Function definitions must have a body.
- p.expected("body");
- }
-
- // Rewrite into a closure expression if it's a function definition.
- if has_params {
- marker.end(p, NodeKind::Closure);
- }
-
- Ok(())
- })
-}
-
-/// Parse a set expression.
-fn set_expr(p: &mut Parser) -> ParseResult {
- p.perform(NodeKind::SetRule, |p| {
- p.assert(NodeKind::Set);
- ident(p)?;
- args(p)
- })
-}
-
-/// Parse a show expression.
-fn show_expr(p: &mut Parser) -> ParseResult {
- p.perform(NodeKind::ShowRule, |p| {
- p.assert(NodeKind::Show);
- let marker = p.marker();
- expr(p)?;
- if p.eat_if(NodeKind::Colon) {
- marker.filter_children(p, |child| match child.kind() {
- NodeKind::Ident(_) | NodeKind::Colon => Ok(()),
- _ => Err("expected identifier"),
- });
- expr(p)?;
- }
- p.expect(NodeKind::As)?;
- expr(p)
- })
-}
-
-/// Parse a wrap expression.
-fn wrap_expr(p: &mut Parser) -> ParseResult {
- p.perform(NodeKind::WrapRule, |p| {
- p.assert(NodeKind::Wrap);
- ident(p)?;
- p.expect(NodeKind::In)?;
- expr(p)
- })
-}
-
-/// Parse an if-else expresion.
-fn if_expr(p: &mut Parser) -> ParseResult {
- p.perform(NodeKind::Conditional, |p| {
- p.assert(NodeKind::If);
-
- expr(p)?;
- body(p)?;
-
- if p.eat_if(NodeKind::Else) {
- if p.at(NodeKind::If) {
- if_expr(p)?;
- } else {
- body(p)?;
- }
- }
-
- Ok(())
- })
-}
-
-/// Parse a while expresion.
-fn while_expr(p: &mut Parser) -> ParseResult {
- p.perform(NodeKind::WhileLoop, |p| {
- p.assert(NodeKind::While);
- expr(p)?;
- body(p)
- })
-}
-
-/// Parse a for-in expression.
-fn for_expr(p: &mut Parser) -> ParseResult {
- p.perform(NodeKind::ForLoop, |p| {
- p.assert(NodeKind::For);
- for_pattern(p)?;
- p.expect(NodeKind::In)?;
- expr(p)?;
- body(p)
- })
-}
-
-/// Parse a for loop pattern.
-fn for_pattern(p: &mut Parser) -> ParseResult {
- p.perform(NodeKind::ForPattern, |p| {
- ident(p)?;
- if p.eat_if(NodeKind::Comma) {
- ident(p)?;
- }
- Ok(())
- })
-}
-
-/// Parse an import expression.
-fn import_expr(p: &mut Parser) -> ParseResult {
- p.perform(NodeKind::ModuleImport, |p| {
- p.assert(NodeKind::Import);
-
- if !p.eat_if(NodeKind::Star) {
- // This is the list of identifiers scenario.
- p.perform(NodeKind::ImportItems, |p| {
- p.start_group(Group::Imports);
- let marker = p.marker();
- let items = collection(p, false).1;
- if items == 0 {
- p.expected("import items");
- }
- p.end_group();
-
- marker.filter_children(p, |n| match n.kind() {
- NodeKind::Ident(_) | NodeKind::Comma => Ok(()),
- _ => Err("expected identifier"),
- });
- });
- };
-
- p.expect(NodeKind::From)?;
- expr(p)
- })
-}
-
-/// Parse an include expression.
-fn include_expr(p: &mut Parser) -> ParseResult {
- p.perform(NodeKind::ModuleInclude, |p| {
- p.assert(NodeKind::Include);
- expr(p)
- })
-}
-
-/// Parse a break expression.
-fn break_expr(p: &mut Parser) -> ParseResult {
- p.perform(NodeKind::BreakStmt, |p| {
- p.assert(NodeKind::Break);
- Ok(())
- })
-}
-
-/// Parse a continue expression.
-fn continue_expr(p: &mut Parser) -> ParseResult {
- p.perform(NodeKind::ContinueStmt, |p| {
- p.assert(NodeKind::Continue);
- Ok(())
- })
-}
-
-/// Parse a return expression.
-fn return_expr(p: &mut Parser) -> ParseResult {
- p.perform(NodeKind::ReturnStmt, |p| {
- p.assert(NodeKind::Return);
- if !p.at(NodeKind::Comma) && !p.eof() {
- expr(p)?;
- }
- Ok(())
- })
-}
-
-/// Parse a control flow body.
-fn body(p: &mut Parser) -> ParseResult {
- match p.peek() {
- Some(NodeKind::LeftBracket) => Ok(content_block(p)),
- Some(NodeKind::LeftBrace) => Ok(code_block(p)),
- _ => {
- p.expected("body");
- Err(ParseError)
- }
- }
-}
-
-#[cfg(test)]
-mod tests {
- use std::fmt::Debug;
-
- #[track_caller]
- pub fn check<T>(text: &str, found: T, expected: T)
- where
- T: Debug + PartialEq,
- {
- if found != expected {
- println!("source: {text:?}");
- println!("expected: {expected:#?}");
- println!("found: {found:#?}");
- panic!("test failed");
- }
- }
-}
diff --git a/src/parse/parser.rs b/src/parse/parser.rs
deleted file mode 100644
index 3dbb7d50..00000000
--- a/src/parse/parser.rs
+++ /dev/null
@@ -1,559 +0,0 @@
-use std::fmt::{self, Display, Formatter};
-use std::mem;
-use std::ops::Range;
-
-use super::{TokenMode, Tokens};
-use crate::syntax::{ErrorPos, InnerNode, NodeData, NodeKind, SyntaxNode};
-use crate::util::EcoString;
-
-/// A convenient token-based parser.
-pub struct Parser<'s> {
- /// An iterator over the source tokens.
- tokens: Tokens<'s>,
- /// Whether we are at the end of the file or of a group.
- eof: bool,
- /// The current token.
- current: Option<NodeKind>,
- /// The end byte index of the last non-trivia token.
- prev_end: usize,
- /// The start byte index of the peeked token.
- current_start: usize,
- /// The stack of open groups.
- groups: Vec<GroupEntry>,
- /// The children of the currently built node.
- children: Vec<SyntaxNode>,
- /// Whether the last group was not correctly terminated.
- unterminated_group: bool,
- /// Whether a group terminator was found that did not close a group.
- stray_terminator: bool,
-}
-
-impl<'s> Parser<'s> {
- /// Create a new parser for the source string.
- pub fn new(text: &'s str, mode: TokenMode) -> Self {
- Self::with_prefix("", text, mode)
- }
-
- /// Create a new parser for the source string that is prefixed by some text
- /// that does not need to be parsed but taken into account for column
- /// calculation.
- pub fn with_prefix(prefix: &str, text: &'s str, mode: TokenMode) -> Self {
- let mut tokens = Tokens::with_prefix(prefix, text, mode);
- let current = tokens.next();
- Self {
- tokens,
- eof: current.is_none(),
- current,
- prev_end: 0,
- current_start: 0,
- groups: vec![],
- children: vec![],
- unterminated_group: false,
- stray_terminator: false,
- }
- }
-
- /// End the parsing process and return the parsed children.
- pub fn finish(self) -> Vec<SyntaxNode> {
- self.children
- }
-
- /// End the parsing process and return
- /// - the parsed children and whether the last token was terminated, if all
- /// groups were terminated correctly, or
- /// - `None` otherwise.
- pub fn consume(self) -> Option<(Vec<SyntaxNode>, bool)> {
- self.terminated().then(|| (self.children, self.tokens.terminated()))
- }
-
- /// Create a new marker.
- pub fn marker(&mut self) -> Marker {
- Marker(self.children.len())
- }
-
- /// Create a marker right before the trailing trivia.
- pub fn trivia_start(&self) -> Marker {
- let count = self
- .children
- .iter()
- .rev()
- .take_while(|node| self.is_trivia(node.kind()))
- .count();
- Marker(self.children.len() - count)
- }
-
- /// Perform a subparse that wraps its result in a node with the given kind.
- pub fn perform<F, T>(&mut self, kind: NodeKind, f: F) -> T
- where
- F: FnOnce(&mut Self) -> T,
- {
- let prev = mem::take(&mut self.children);
- let output = f(self);
- let until = self.trivia_start();
- let mut children = mem::replace(&mut self.children, prev);
-
- if self.tokens.mode() == TokenMode::Markup {
- self.children.push(InnerNode::with_children(kind, children).into());
- } else {
- // Trailing trivia should not be wrapped into the new node.
- let idx = self.children.len();
- self.children.push(SyntaxNode::default());
- self.children.extend(children.drain(until.0 ..));
- self.children[idx] = InnerNode::with_children(kind, children).into();
- }
-
- output
- }
-
- /// Whether the end of the source string or group is reached.
- pub fn eof(&self) -> bool {
- self.eof
- }
-
- /// Consume the current token and also trailing trivia.
- pub fn eat(&mut self) {
- self.stray_terminator |= match self.current {
- Some(NodeKind::RightParen) => !self.inside(Group::Paren),
- Some(NodeKind::RightBracket) => !self.inside(Group::Bracket),
- Some(NodeKind::RightBrace) => !self.inside(Group::Brace),
- _ => false,
- };
-
- self.prev_end = self.tokens.cursor();
- self.bump();
-
- if self.tokens.mode() != TokenMode::Markup {
- // Skip whitespace and comments.
- while self.current.as_ref().map_or(false, |x| self.is_trivia(x)) {
- self.bump();
- }
- }
-
- self.repeek();
- }
-
- /// Consume the current token if it is the given one.
- pub fn eat_if(&mut self, kind: NodeKind) -> bool {
- let at = self.at(kind);
- if at {
- self.eat();
- }
- at
- }
-
- /// Eat tokens while the condition is true.
- pub fn eat_while<F>(&mut self, mut f: F)
- where
- F: FnMut(&NodeKind) -> bool,
- {
- while self.peek().map_or(false, |t| f(t)) {
- self.eat();
- }
- }
-
- /// Consume the current token if it is the given one and produce an error if
- /// not.
- pub fn expect(&mut self, kind: NodeKind) -> ParseResult {
- let at = self.peek() == Some(&kind);
- if at {
- self.eat();
- Ok(())
- } else {
- self.expected(kind.name());
- Err(ParseError)
- }
- }
-
- /// Consume the current token, debug-asserting that it is the given one.
- #[track_caller]
- pub fn assert(&mut self, kind: NodeKind) {
- debug_assert_eq!(self.peek(), Some(&kind));
- self.eat();
- }
-
- /// Whether the current token is of the given type.
- pub fn at(&self, kind: NodeKind) -> bool {
- self.peek() == Some(&kind)
- }
-
- /// Peek at the current token without consuming it.
- pub fn peek(&self) -> Option<&NodeKind> {
- if self.eof { None } else { self.current.as_ref() }
- }
-
- /// Peek at the current token, but only if it follows immediately after the
- /// last one without any trivia in between.
- pub fn peek_direct(&self) -> Option<&NodeKind> {
- if self.prev_end() == self.current_start() {
- self.peek()
- } else {
- None
- }
- }
-
- /// Peek at the source of the current token.
- pub fn peek_src(&self) -> &'s str {
- self.get(self.current_start() .. self.current_end())
- }
-
- /// Obtain a range of the source code.
- pub fn get(&self, range: Range<usize>) -> &'s str {
- self.tokens.scanner().get(range)
- }
-
- /// The byte index at which the last non-trivia token ended.
- pub fn prev_end(&self) -> usize {
- self.prev_end
- }
-
- /// The byte index at which the current token starts.
- pub fn current_start(&self) -> usize {
- self.current_start
- }
-
- /// The byte index at which the current token ends.
- pub fn current_end(&self) -> usize {
- self.tokens.cursor()
- }
-
- /// Determine the column index for the given byte index.
- pub fn column(&self, index: usize) -> usize {
- self.tokens.column(index)
- }
-
- /// Continue parsing in a group.
- ///
- /// When the end delimiter of the group is reached, all subsequent calls to
- /// `peek()` return `None`. Parsing can only continue with a matching call
- /// to `end_group`.
- ///
- /// This panics if the current token does not start the given group.
- #[track_caller]
- pub fn start_group(&mut self, kind: Group) {
- self.groups.push(GroupEntry { kind, prev_mode: self.tokens.mode() });
- self.tokens.set_mode(match kind {
- Group::Strong | Group::Emph => TokenMode::Markup,
- Group::Bracket => match self.tokens.mode() {
- TokenMode::Math => TokenMode::Math,
- _ => TokenMode::Markup,
- },
- Group::Brace | Group::Paren => match self.tokens.mode() {
- TokenMode::Math => TokenMode::Math,
- _ => TokenMode::Code,
- },
- Group::Math => TokenMode::Math,
- Group::Expr | Group::Imports => TokenMode::Code,
- });
-
- match kind {
- Group::Brace => self.assert(NodeKind::LeftBrace),
- Group::Bracket => self.assert(NodeKind::LeftBracket),
- Group::Paren => self.assert(NodeKind::LeftParen),
- Group::Strong => self.assert(NodeKind::Star),
- Group::Emph => self.assert(NodeKind::Underscore),
- Group::Math => self.assert(NodeKind::Dollar),
- Group::Expr => self.repeek(),
- Group::Imports => self.repeek(),
- }
- }
-
- /// End the parsing of a group.
- ///
- /// This panics if no group was started.
- #[track_caller]
- pub fn end_group(&mut self) {
- let group_mode = self.tokens.mode();
- let group = self.groups.pop().expect("no started group");
- self.tokens.set_mode(group.prev_mode);
-
- let mut rescan = self.tokens.mode() != group_mode;
-
- // Eat the end delimiter if there is one.
- if let Some((end, required)) = match group.kind {
- Group::Brace => Some((NodeKind::RightBrace, true)),
- Group::Bracket => Some((NodeKind::RightBracket, true)),
- Group::Paren => Some((NodeKind::RightParen, true)),
- Group::Strong => Some((NodeKind::Star, true)),
- Group::Emph => Some((NodeKind::Underscore, true)),
- Group::Math => Some((NodeKind::Dollar, true)),
- Group::Expr => Some((NodeKind::Semicolon, false)),
- Group::Imports => None,
- } {
- if self.current.as_ref() == Some(&end) {
- // If another group closes after a group with the missing
- // terminator, its scope of influence ends here and no longer
- // taints the rest of the reparse.
- self.unterminated_group = false;
-
- // Bump the delimeter and return. No need to rescan in this
- // case. Also, we know that the delimiter is not stray even
- // though we already removed the group.
- let s = self.stray_terminator;
- self.eat();
- self.stray_terminator = s;
- rescan = false;
- } else if required {
- self.expected(end.name());
- self.unterminated_group = true;
- }
- }
-
- // Rescan the peeked token if the mode changed.
- if rescan {
- let mut target = self.prev_end();
- if group_mode != TokenMode::Markup {
- let start = self.trivia_start().0;
- target = self.current_start
- - self.children[start ..].iter().map(SyntaxNode::len).sum::<usize>();
- self.children.truncate(start);
- }
-
- self.tokens.jump(target);
- self.prev_end = self.tokens.cursor();
- self.current_start = self.tokens.cursor();
- self.current = self.tokens.next();
- }
-
- self.repeek();
- }
-
- /// Checks if all groups were correctly terminated.
- fn terminated(&self) -> bool {
- self.groups.is_empty() && !self.unterminated_group && !self.stray_terminator
- }
-
- /// Low-level bump that consumes exactly one token without special trivia
- /// handling.
- fn bump(&mut self) {
- let kind = self.current.take().unwrap();
- let len = self.tokens.cursor() - self.current_start;
- self.children.push(NodeData::new(kind, len).into());
- self.current_start = self.tokens.cursor();
- self.current = self.tokens.next();
- }
-
- /// Take another look at the current token to recheck whether it ends a
- /// group.
- fn repeek(&mut self) {
- self.eof = match &self.current {
- Some(NodeKind::RightBrace) => self.inside(Group::Brace),
- Some(NodeKind::RightBracket) => self.inside(Group::Bracket),
- Some(NodeKind::RightParen) => self.inside(Group::Paren),
- Some(NodeKind::Star) => self.inside(Group::Strong),
- Some(NodeKind::Underscore) => self.inside(Group::Emph),
- Some(NodeKind::Dollar) => self.inside(Group::Math),
- Some(NodeKind::Semicolon) => self.inside(Group::Expr),
- Some(NodeKind::From) => self.inside(Group::Imports),
- Some(NodeKind::Space { newlines }) => self.space_ends_group(*newlines),
- Some(_) => false,
- None => true,
- };
- }
-
- /// Returns whether the given type can be skipped over.
- fn is_trivia(&self, token: &NodeKind) -> bool {
- match token {
- NodeKind::Space { newlines } => !self.space_ends_group(*newlines),
- NodeKind::LineComment => true,
- NodeKind::BlockComment => true,
- _ => false,
- }
- }
-
- /// Whether a space with the given number of newlines ends the current group.
- fn space_ends_group(&self, n: usize) -> bool {
- if n == 0 {
- return false;
- }
-
- match self.groups.last().map(|group| group.kind) {
- Some(Group::Strong | Group::Emph) => n >= 2,
- Some(Group::Imports) => n >= 1,
- Some(Group::Expr) if n >= 1 => {
- // Allow else and method call to continue on next line.
- self.groups.iter().nth_back(1).map(|group| group.kind)
- != Some(Group::Brace)
- || !matches!(
- self.tokens.clone().next(),
- Some(NodeKind::Else | NodeKind::Dot)
- )
- }
- _ => false,
- }
- }
-
- /// Whether we are inside the given group (can be nested).
- fn inside(&self, kind: Group) -> bool {
- self.groups
- .iter()
- .rev()
- .take_while(|g| !kind.is_weak() || g.kind.is_weak())
- .any(|g| g.kind == kind)
- }
-}
-
-/// Error handling.
-impl Parser<'_> {
- /// Eat the current token and add an error that it is unexpected.
- pub fn unexpected(&mut self) {
- if let Some(found) = self.peek() {
- let msg = format_eco!("unexpected {}", found.name());
- let error = NodeKind::Error(ErrorPos::Full, msg);
- self.perform(error, Self::eat);
- }
- }
-
- /// Add an error that the `thing` was expected at the end of the last
- /// non-trivia token.
- pub fn expected(&mut self, thing: &str) {
- self.expected_at(self.trivia_start(), thing);
- }
-
- /// Insert an error message that `what` was expected at the marker position.
- pub fn expected_at(&mut self, marker: Marker, what: &str) {
- let msg = format_eco!("expected {}", what);
- let error = NodeKind::Error(ErrorPos::Full, msg);
- self.children.insert(marker.0, NodeData::new(error, 0).into());
- }
-
- /// Eat the current token and add an error that it is not the expected
- /// `thing`.
- pub fn expected_found(&mut self, thing: &str) {
- match self.peek() {
- Some(found) => {
- let msg = format_eco!("expected {}, found {}", thing, found.name());
- let error = NodeKind::Error(ErrorPos::Full, msg);
- self.perform(error, Self::eat);
- }
- None => self.expected(thing),
- }
- }
-}
-
-/// Marks a location in a parser's child list.
-#[derive(Debug, Copy, Clone, Eq, PartialEq)]
-pub struct Marker(usize);
-
-impl Marker {
- /// Peek at the child directly before the marker.
- pub fn before<'a>(self, p: &'a Parser) -> Option<&'a SyntaxNode> {
- p.children.get(self.0.checked_sub(1)?)
- }
-
- /// Peek at the child directly after the marker.
- pub fn after<'a>(self, p: &'a Parser) -> Option<&'a SyntaxNode> {
- p.children.get(self.0)
- }
-
- /// Convert the child directly after marker.
- pub fn convert(self, p: &mut Parser, kind: NodeKind) {
- if let Some(child) = p.children.get_mut(self.0) {
- child.convert(kind);
- }
- }
-
- /// Perform a subparse that wraps all children after the marker in a node
- /// with the given kind.
- pub fn perform<T, F>(self, p: &mut Parser, kind: NodeKind, f: F) -> T
- where
- F: FnOnce(&mut Parser) -> T,
- {
- let success = f(p);
- self.end(p, kind);
- success
- }
-
- /// Wrap all children after the marker (excluding trailing trivia) in a node
- /// with the given `kind`.
- pub fn end(self, p: &mut Parser, kind: NodeKind) {
- let until = p.trivia_start().0.max(self.0);
- let children = p.children.drain(self.0 .. until).collect();
- p.children
- .insert(self.0, InnerNode::with_children(kind, children).into());
- }
-
- /// Wrap all children that do not fulfill the predicate in error nodes.
- pub fn filter_children<F>(self, p: &mut Parser, mut f: F)
- where
- F: FnMut(&SyntaxNode) -> Result<(), &'static str>,
- {
- for child in &mut p.children[self.0 ..] {
- // Don't expose errors.
- if child.kind().is_error() {
- continue;
- }
-
- // Don't expose trivia in code.
- if p.tokens.mode() != TokenMode::Markup && child.kind().is_trivia() {
- continue;
- }
-
- if let Err(msg) = f(child) {
- let mut msg = EcoString::from(msg);
- if msg.starts_with("expected") {
- msg.push_str(", found ");
- msg.push_str(child.kind().name());
- }
- let error = NodeKind::Error(ErrorPos::Full, msg);
- let inner = mem::take(child);
- *child = InnerNode::with_child(error, inner).into();
- }
- }
- }
-}
-
-/// A logical group of tokens, e.g. `[...]`.
-#[derive(Debug)]
-struct GroupEntry {
- /// The kind of group this is. This decides which token(s) will end the
- /// group. For example, a [`Group::Paren`] will be ended by
- /// [`Token::RightParen`].
- pub kind: Group,
- /// The mode the parser was in _before_ the group started (to which we go
- /// back once the group ends).
- pub prev_mode: TokenMode,
-}
-
-/// A group, confined by optional start and end delimiters.
-#[derive(Debug, Copy, Clone, Eq, PartialEq)]
-pub enum Group {
- /// A curly-braced group: `{...}`.
- Brace,
- /// A bracketed group: `[...]`.
- Bracket,
- /// A parenthesized group: `(...)`.
- Paren,
- /// A group surrounded with stars: `*...*`.
- Strong,
- /// A group surrounded with underscore: `_..._`.
- Emph,
- /// A group surrounded by dollar signs: `$...$`.
- Math,
- /// A group ended by a semicolon or a line break: `;`, `\n`.
- Expr,
- /// A group for import items, ended by a semicolon, line break or `from`.
- Imports,
-}
-
-impl Group {
- /// Whether the group can only force other weak groups to end.
- fn is_weak(self) -> bool {
- matches!(self, Group::Strong | Group::Emph)
- }
-}
-
-/// Allows parser methods to use the try operator. Never returned top-level
-/// because the parser recovers from all errors.
-pub type ParseResult<T = ()> = Result<T, ParseError>;
-
-/// The error type for parsing.
-#[derive(Debug, Copy, Clone, Eq, PartialEq)]
-pub struct ParseError;
-
-impl Display for ParseError {
- fn fmt(&self, f: &mut Formatter) -> fmt::Result {
- f.pad("failed to parse")
- }
-}
-
-impl std::error::Error for ParseError {}
diff --git a/src/parse/resolve.rs b/src/parse/resolve.rs
deleted file mode 100644
index 9fde0cf4..00000000
--- a/src/parse/resolve.rs
+++ /dev/null
@@ -1,238 +0,0 @@
-use unscanny::Scanner;
-
-use super::{is_ident, is_newline};
-use crate::syntax::RawKind;
-use crate::util::EcoString;
-
-/// Resolve all escape sequences in a string.
-pub fn resolve_string(string: &str) -> EcoString {
- let mut out = EcoString::with_capacity(string.len());
- let mut s = Scanner::new(string);
-
- while let Some(c) = s.eat() {
- if c != '\\' {
- out.push(c);
- continue;
- }
-
- let start = s.locate(-1);
- match s.eat() {
- Some('\\') => out.push('\\'),
- Some('"') => out.push('"'),
- Some('n') => out.push('\n'),
- Some('r') => out.push('\r'),
- Some('t') => out.push('\t'),
- Some('u') if s.eat_if('{') => {
- // TODO: Error if closing brace is missing.
- let sequence = s.eat_while(char::is_ascii_hexdigit);
- let _terminated = s.eat_if('}');
- match resolve_hex(sequence) {
- Some(c) => out.push(c),
- None => out.push_str(s.from(start)),
- }
- }
-
- _ => out.push_str(s.from(start)),
- }
- }
-
- out
-}
-
-/// Resolve a hexadecimal escape sequence into a character
-/// (only the inner hex letters without braces or `\u`).
-pub fn resolve_hex(sequence: &str) -> Option<char> {
- u32::from_str_radix(sequence, 16).ok().and_then(std::char::from_u32)
-}
-
-/// Resolve the language tag and trim the raw text.
-pub fn resolve_raw(column: usize, backticks: usize, text: &str) -> RawKind {
- if backticks > 1 {
- let (tag, inner) = split_at_lang_tag(text);
- let (text, block) = trim_and_split_raw(column, inner);
- RawKind {
- lang: is_ident(tag).then(|| tag.into()),
- text: text.into(),
- block,
- }
- } else {
- RawKind {
- lang: None,
- text: split_lines(text).join("\n").into(),
- block: false,
- }
- }
-}
-
-/// Parse the lang tag and return it alongside the remaining inner raw text.
-fn split_at_lang_tag(raw: &str) -> (&str, &str) {
- let mut s = Scanner::new(raw);
- (
- s.eat_until(|c: char| c == '`' || c.is_whitespace() || is_newline(c)),
- s.after(),
- )
-}
-
-/// Trim raw text and splits it into lines.
-///
-/// Also returns whether at least one newline was contained in `raw`.
-fn trim_and_split_raw(column: usize, mut raw: &str) -> (String, bool) {
- // Trims one space at the start.
- raw = raw.strip_prefix(' ').unwrap_or(raw);
-
- // Trim one space at the end if the last non-whitespace char is a backtick.
- if raw.trim_end().ends_with('`') {
- raw = raw.strip_suffix(' ').unwrap_or(raw);
- }
-
- let mut lines = split_lines(raw);
-
- // Dedent based on column, but not for the first line.
- for line in lines.iter_mut().skip(1) {
- let offset = line
- .chars()
- .take(column)
- .take_while(|c| c.is_whitespace())
- .map(char::len_utf8)
- .sum();
- *line = &line[offset ..];
- }
-
- let had_newline = lines.len() > 1;
- let is_whitespace = |line: &&str| line.chars().all(char::is_whitespace);
-
- // Trims a sequence of whitespace followed by a newline at the start.
- if lines.first().map_or(false, is_whitespace) {
- lines.remove(0);
- }
-
- // Trims a newline followed by a sequence of whitespace at the end.
- if lines.last().map_or(false, is_whitespace) {
- lines.pop();
- }
-
- (lines.join("\n"), had_newline)
-}
-
-/// Split a string into a vector of lines
-/// (respecting Unicode, Unix, Mac and Windows line breaks).
-fn split_lines(text: &str) -> Vec<&str> {
- let mut s = Scanner::new(text);
- let mut lines = Vec::new();
- let mut start = 0;
- let mut end = 0;
-
- while let Some(c) = s.eat() {
- if is_newline(c) {
- if c == '\r' {
- s.eat_if('\n');
- }
-
- lines.push(&text[start .. end]);
- start = s.cursor();
- }
- end = s.cursor();
- }
-
- lines.push(&text[start ..]);
- lines
-}
-
-#[cfg(test)]
-#[rustfmt::skip]
-mod tests {
- use super::*;
-
- #[test]
- fn test_resolve_strings() {
- #[track_caller]
- fn test(string: &str, expected: &str) {
- assert_eq!(resolve_string(string), expected);
- }
-
- test(r#"hello world"#, "hello world");
- test(r#"hello\nworld"#, "hello\nworld");
- test(r#"a\"bc"#, "a\"bc");
- test(r#"a\u{2603}bc"#, "a☃bc");
- test(r#"a\u{26c3bg"#, "a𦰻g");
- test(r#"av\u{6797"#, "av林");
- test(r#"a\\"#, "a\\");
- test(r#"a\\\nbc"#, "a\\\nbc");
- test(r#"a\t\r\nbc"#, "a\t\r\nbc");
- test(r"🌎", "🌎");
- test(r"🌎\", r"🌎\");
- test(r"\🌎", r"\🌎");
- }
-
- #[test]
- fn test_split_at_lang_tag() {
- #[track_caller]
- fn test(text: &str, lang: &str, inner: &str) {
- assert_eq!(split_at_lang_tag(text), (lang, inner));
- }
-
- test("typst it!", "typst", " it!");
- test("typst\n it!", "typst", "\n it!");
- test("typst\n it!", "typst", "\n it!");
- test("abc`", "abc", "`");
- test(" hi", "", " hi");
- test("`", "", "`");
- }
-
- #[test]
- fn test_resolve_raw() {
- #[track_caller]
- fn test(
- column: usize,
- backticks: usize,
- raw: &str,
- lang: Option<&str>,
- text: &str,
- block: bool,
- ) {
- let node = resolve_raw(column, backticks, raw);
- assert_eq!(node.lang.as_deref(), lang);
- assert_eq!(node.text, text);
- assert_eq!(node.block, block);
- }
-
- // Just one backtick.
- test(0, 1, "py", None, "py", false);
- test(0, 1, "1\n2", None, "1\n2", false);
- test(0, 1, "1\r\n2", None, "1\n2", false);
-
- // More than one backtick with lang tag.
- test(0, 2, "js alert()", Some("js"), "alert()", false);
- test(0, 3, "py quit(\n\n)", Some("py"), "quit(\n\n)", true);
- test(0, 2, "♥", None, "", false);
-
- // Trimming of whitespace (tested more thoroughly in separate test).
- test(0, 2, " a", None, "a", false);
- test(0, 2, " a", None, " a", false);
- test(0, 2, " \na", None, "a", true);
-
- // Dedenting
- test(2, 3, " def foo():\n bar()", None, "def foo():\n bar()", true);
- }
-
- #[test]
- fn test_trim_raw() {
- #[track_caller]
- fn test(text: &str, expected: &str) {
- assert_eq!(trim_and_split_raw(0, text).0, expected);
- }
-
- test(" hi", "hi");
- test(" hi", " hi");
- test("\nhi", "hi");
- test(" \n hi", " hi");
- test("hi` ", "hi`");
- test("hi` ", "hi` ");
- test("hi` ", "hi` ");
- test("hi ", "hi ");
- test("hi ", "hi ");
- test("hi\n", "hi");
- test("hi \n ", "hi ");
- test(" \n hi \n ", " hi ");
- }
-}
diff --git a/src/parse/tokens.rs b/src/parse/tokens.rs
deleted file mode 100644
index 73c64d1e..00000000
--- a/src/parse/tokens.rs
+++ /dev/null
@@ -1,1176 +0,0 @@
-use std::sync::Arc;
-
-use unicode_xid::UnicodeXID;
-use unscanny::Scanner;
-
-use super::resolve::{resolve_hex, resolve_raw, resolve_string};
-use crate::geom::{AngleUnit, LengthUnit};
-use crate::syntax::{ErrorPos, NodeKind, RawKind, Unit};
-use crate::util::EcoString;
-
-/// An iterator over the tokens of a string of source code.
-#[derive(Clone)]
-pub struct Tokens<'s> {
- /// The underlying scanner.
- s: Scanner<'s>,
- /// The mode the scanner is in. This determines what tokens it recognizes.
- mode: TokenMode,
- /// Whether the last token has been terminated.
- terminated: bool,
- /// Offsets the indentation on the first line of the source.
- column_offset: usize,
-}
-
-/// What kind of tokens to emit.
-#[derive(Debug, Copy, Clone, Eq, PartialEq)]
-pub enum TokenMode {
- /// Text and markup.
- Markup,
- /// Math atoms, operators, etc.
- Math,
- /// Keywords, literals and operators.
- Code,
-}
-
-impl<'s> Tokens<'s> {
- /// Create a new token iterator with the given mode.
- #[inline]
- pub fn new(text: &'s str, mode: TokenMode) -> Self {
- Self::with_prefix("", text, mode)
- }
-
- /// Create a new token iterator with the given mode and a prefix to offset
- /// column calculations.
- #[inline]
- pub fn with_prefix(prefix: &str, text: &'s str, mode: TokenMode) -> Self {
- Self {
- s: Scanner::new(text),
- mode,
- terminated: true,
- column_offset: column(prefix, prefix.len(), 0),
- }
- }
-
- /// Get the current token mode.
- #[inline]
- pub fn mode(&self) -> TokenMode {
- self.mode
- }
-
- /// Change the token mode.
- #[inline]
- pub fn set_mode(&mut self, mode: TokenMode) {
- self.mode = mode;
- }
-
- /// The index in the string at which the last token ends and next token
- /// will start.
- #[inline]
- pub fn cursor(&self) -> usize {
- self.s.cursor()
- }
-
- /// Jump to the given index in the string.
- #[inline]
- pub fn jump(&mut self, index: usize) {
- self.s.jump(index);
- }
-
- /// The underlying scanner.
- #[inline]
- pub fn scanner(&self) -> Scanner<'s> {
- self.s
- }
-
- /// Whether the last token was terminated.
- #[inline]
- pub fn terminated(&self) -> bool {
- self.terminated
- }
-
- /// The column index of a given index in the source string.
- #[inline]
- pub fn column(&self, index: usize) -> usize {
- column(self.s.string(), index, self.column_offset)
- }
-}
-
-impl<'s> Iterator for Tokens<'s> {
- type Item = NodeKind;
-
- /// Parse the next token in the source code.
- #[inline]
- fn next(&mut self) -> Option<Self::Item> {
- let start = self.s.cursor();
- let c = self.s.eat()?;
- Some(match c {
- // Trivia.
- '/' if self.s.eat_if('/') => self.line_comment(),
- '/' if self.s.eat_if('*') => self.block_comment(),
- '*' if self.s.eat_if('/') => {
- NodeKind::Error(ErrorPos::Full, "unexpected end of block comment".into())
- }
- c if c.is_whitespace() => self.whitespace(c),
-
- // Other things.
- _ => match self.mode {
- TokenMode::Markup => self.markup(start, c),
- TokenMode::Math => self.math(start, c),
- TokenMode::Code => self.code(start, c),
- },
- })
- }
-}
-
-impl<'s> Tokens<'s> {
- fn line_comment(&mut self) -> NodeKind {
- self.s.eat_until(is_newline);
- if self.s.peek().is_none() {
- self.terminated = false;
- }
- NodeKind::LineComment
- }
-
- fn block_comment(&mut self) -> NodeKind {
- let mut state = '_';
- let mut depth = 1;
- self.terminated = false;
-
- // Find the first `*/` that does not correspond to a nested `/*`.
- while let Some(c) = self.s.eat() {
- state = match (state, c) {
- ('*', '/') => {
- depth -= 1;
- if depth == 0 {
- self.terminated = true;
- break;
- }
- '_'
- }
- ('/', '*') => {
- depth += 1;
- '_'
- }
- ('/', '/') => {
- self.line_comment();
- '_'
- }
- _ => c,
- }
- }
-
- NodeKind::BlockComment
- }
-
- fn whitespace(&mut self, c: char) -> NodeKind {
- if c == ' ' && !self.s.at(char::is_whitespace) {
- return NodeKind::Space { newlines: 0 };
- }
-
- self.s.uneat();
-
- // Count the number of newlines.
- let mut newlines = 0;
- while let Some(c) = self.s.eat() {
- if !c.is_whitespace() {
- self.s.uneat();
- break;
- }
-
- if is_newline(c) {
- if c == '\r' {
- self.s.eat_if('\n');
- }
- newlines += 1;
- }
- }
-
- NodeKind::Space { newlines }
- }
-
- #[inline]
- fn markup(&mut self, start: usize, c: char) -> NodeKind {
- match c {
- // Blocks.
- '{' => NodeKind::LeftBrace,
- '}' => NodeKind::RightBrace,
- '[' => NodeKind::LeftBracket,
- ']' => NodeKind::RightBracket,
-
- // Multi-char things.
- '#' => self.hash(start),
- '.' if self.s.eat_if("..") => NodeKind::Shorthand('\u{2026}'),
- '-' => self.hyph(),
- 'h' if self.s.eat_if("ttp://") || self.s.eat_if("ttps://") => {
- self.link(start)
- }
- '`' => self.raw(),
- c if c.is_ascii_digit() => self.numbering(start),
- '<' => self.label(),
- '@' => self.reference(start),
-
- // Escape sequences.
- '\\' => self.backslash(),
-
- // Single-char things.
- '~' => NodeKind::Shorthand('\u{00A0}'),
- '\'' => NodeKind::SmartQuote { double: false },
- '"' => NodeKind::SmartQuote { double: true },
- '*' if !self.in_word() => NodeKind::Star,
- '_' if !self.in_word() => NodeKind::Underscore,
- '$' => NodeKind::Dollar,
- '=' => NodeKind::Eq,
- '+' => NodeKind::Plus,
- '/' => NodeKind::Slash,
- ':' => NodeKind::Colon,
-
- // Plain text.
- _ => self.text(start),
- }
- }
-
- #[inline]
- fn text(&mut self, start: usize) -> NodeKind {
- macro_rules! table {
- ($(|$c:literal)*) => {{
- let mut t = [false; 128];
- $(t[$c as usize] = true;)*
- t
- }}
- }
-
- const TABLE: [bool; 128] = table! {
- | ' ' | '\t' | '\n' | '\x0b' | '\x0c' | '\r' | '\\' | '/'
- | '[' | ']' | '{' | '}' | '~' | '-' | '.' | '\'' | '"'
- | '*' | '_' | ':' | 'h' | '`' | '$' | '<' | '>' | '@' | '#'
- };
-
- loop {
- self.s.eat_until(|c: char| {
- TABLE.get(c as usize).copied().unwrap_or_else(|| c.is_whitespace())
- });
-
- // Continue with the same text node if the thing would become text
- // anyway.
- let mut s = self.s;
- match s.eat() {
- Some('/') if !s.at(['/', '*']) => {}
- Some(' ') if s.at(char::is_alphanumeric) => {}
- Some('-') if !s.at(['-', '?']) => {}
- Some('.') if !s.at("..") => {}
- Some('h') if !s.at("ttp://") && !s.at("ttps://") => {}
- Some('@' | '#') if !s.at(is_id_start) => {}
- _ => break,
- }
-
- self.s = s;
- }
-
- NodeKind::Text(self.s.from(start).into())
- }
-
- fn backslash(&mut self) -> NodeKind {
- match self.s.peek() {
- Some('u') if self.s.eat_if("u{") => {
- let sequence = self.s.eat_while(char::is_ascii_alphanumeric);
- if self.s.eat_if('}') {
- if let Some(c) = resolve_hex(sequence) {
- NodeKind::Escape(c)
- } else {
- NodeKind::Error(
- ErrorPos::Full,
- "invalid unicode escape sequence".into(),
- )
- }
- } else {
- self.terminated = false;
- NodeKind::Error(ErrorPos::End, "expected closing brace".into())
- }
- }
-
- // Linebreaks.
- Some(c) if c.is_whitespace() => NodeKind::Linebreak,
- None => NodeKind::Linebreak,
-
- // Escapes.
- Some(c) => {
- self.s.expect(c);
- NodeKind::Escape(c)
- }
- }
- }
-
- fn hash(&mut self, start: usize) -> NodeKind {
- if self.s.at(is_id_start) {
- let read = self.s.eat_while(is_id_continue);
- match keyword(read) {
- Some(keyword) => keyword,
- None => NodeKind::Ident(read.into()),
- }
- } else {
- self.text(start)
- }
- }
-
- fn hyph(&mut self) -> NodeKind {
- if self.s.eat_if('-') {
- if self.s.eat_if('-') {
- NodeKind::Shorthand('\u{2014}')
- } else {
- NodeKind::Shorthand('\u{2013}')
- }
- } else if self.s.eat_if('?') {
- NodeKind::Shorthand('\u{00AD}')
- } else {
- NodeKind::Minus
- }
- }
-
- fn link(&mut self, start: usize) -> NodeKind {
- #[rustfmt::skip]
- self.s.eat_while(|c: char| matches!(c,
- | '0' ..= '9'
- | 'a' ..= 'z'
- | 'A' ..= 'Z'
- | '~' | '/' | '%' | '?' | '#' | '&' | '+' | '='
- | '\'' | '.' | ',' | ';'
- ));
- if self.s.scout(-1) == Some('.') {
- self.s.uneat();
- }
- NodeKind::Link(self.s.from(start).into())
- }
-
- fn raw(&mut self) -> NodeKind {
- let column = self.column(self.s.cursor() - 1);
-
- let mut backticks = 1;
- while self.s.eat_if('`') {
- backticks += 1;
- }
-
- // Special case for empty inline block.
- if backticks == 2 {
- return NodeKind::Raw(Arc::new(RawKind {
- text: EcoString::new(),
- lang: None,
- block: false,
- }));
- }
-
- let start = self.s.cursor();
- let mut found = 0;
- while found < backticks {
- match self.s.eat() {
- Some('`') => found += 1,
- Some(_) => found = 0,
- None => break,
- }
- }
-
- if found == backticks {
- let end = self.s.cursor() - found as usize;
- NodeKind::Raw(Arc::new(resolve_raw(
- column,
- backticks,
- self.s.get(start .. end),
- )))
- } else {
- self.terminated = false;
- let remaining = backticks - found;
- let noun = if remaining == 1 { "backtick" } else { "backticks" };
- NodeKind::Error(
- ErrorPos::End,
- if found == 0 {
- format_eco!("expected {} {}", remaining, noun)
- } else {
- format_eco!("expected {} more {}", remaining, noun)
- },
- )
- }
- }
-
- fn numbering(&mut self, start: usize) -> NodeKind {
- self.s.eat_while(char::is_ascii_digit);
- let read = self.s.from(start);
- if self.s.eat_if('.') {
- if let Ok(number) = read.parse() {
- return NodeKind::EnumNumbering(number);
- }
- }
-
- self.text(start)
- }
-
- fn label(&mut self) -> NodeKind {
- let label = self.s.eat_while(is_id_continue);
- if self.s.eat_if('>') {
- if !label.is_empty() {
- NodeKind::Label(label.into())
- } else {
- NodeKind::Error(ErrorPos::Full, "label cannot be empty".into())
- }
- } else {
- self.terminated = false;
- NodeKind::Error(ErrorPos::End, "expected closing angle bracket".into())
- }
- }
-
- fn reference(&mut self, start: usize) -> NodeKind {
- let label = self.s.eat_while(is_id_continue);
- if !label.is_empty() {
- NodeKind::Ref(label.into())
- } else {
- self.text(start)
- }
- }
-
- fn math(&mut self, start: usize, c: char) -> NodeKind {
- match c {
- // Escape sequences.
- '\\' => self.backslash(),
-
- // Single-char things.
- '_' => NodeKind::Underscore,
- '^' => NodeKind::Hat,
- '/' => NodeKind::Slash,
- '&' => NodeKind::Amp,
- '$' => NodeKind::Dollar,
-
- // Brackets.
- '{' => NodeKind::LeftBrace,
- '}' => NodeKind::RightBrace,
- '[' => NodeKind::LeftBracket,
- ']' => NodeKind::RightBracket,
- '(' => NodeKind::LeftParen,
- ')' => NodeKind::RightParen,
-
- // Identifiers.
- c if is_math_id_start(c) && self.s.at(is_math_id_continue) => {
- self.s.eat_while(is_math_id_continue);
- NodeKind::Ident(self.s.from(start).into())
- }
-
- // Numbers.
- c if c.is_numeric() => {
- self.s.eat_while(char::is_numeric);
- NodeKind::Atom(self.s.from(start).into())
- }
-
- // Other math atoms.
- c => NodeKind::Atom(c.into()),
- }
- }
-
- fn code(&mut self, start: usize, c: char) -> NodeKind {
- match c {
- // Blocks.
- '{' => NodeKind::LeftBrace,
- '}' => NodeKind::RightBrace,
- '[' => NodeKind::LeftBracket,
- ']' => NodeKind::RightBracket,
-
- // Parentheses.
- '(' => NodeKind::LeftParen,
- ')' => NodeKind::RightParen,
-
- // Two-char operators.
- '=' if self.s.eat_if('=') => NodeKind::EqEq,
- '!' if self.s.eat_if('=') => NodeKind::ExclEq,
- '<' if self.s.eat_if('=') => NodeKind::LtEq,
- '>' if self.s.eat_if('=') => NodeKind::GtEq,
- '+' if self.s.eat_if('=') => NodeKind::PlusEq,
- '-' if self.s.eat_if('=') => NodeKind::HyphEq,
- '*' if self.s.eat_if('=') => NodeKind::StarEq,
- '/' if self.s.eat_if('=') => NodeKind::SlashEq,
- '.' if self.s.eat_if('.') => NodeKind::Dots,
- '=' if self.s.eat_if('>') => NodeKind::Arrow,
-
- // Single-char operators.
- ',' => NodeKind::Comma,
- ';' => NodeKind::Semicolon,
- ':' => NodeKind::Colon,
- '+' => NodeKind::Plus,
- '-' => NodeKind::Minus,
- '*' => NodeKind::Star,
- '/' => NodeKind::Slash,
- '=' => NodeKind::Eq,
- '<' => NodeKind::Lt,
- '>' => NodeKind::Gt,
- '.' if !self.s.at(char::is_ascii_digit) => NodeKind::Dot,
-
- // Identifiers.
- c if is_id_start(c) => self.ident(start),
-
- // Numbers.
- c if c.is_ascii_digit() || (c == '.' && self.s.at(char::is_ascii_digit)) => {
- self.number(start, c)
- }
-
- // Strings.
- '"' => self.string(),
-
- // Invalid token.
- _ => NodeKind::Error(ErrorPos::Full, "not valid here".into()),
- }
- }
-
- fn ident(&mut self, start: usize) -> NodeKind {
- self.s.eat_while(is_id_continue);
- match self.s.from(start) {
- "none" => NodeKind::None,
- "auto" => NodeKind::Auto,
- "true" => NodeKind::Bool(true),
- "false" => NodeKind::Bool(false),
- id => keyword(id).unwrap_or_else(|| NodeKind::Ident(id.into())),
- }
- }
-
- fn number(&mut self, start: usize, c: char) -> NodeKind {
- // Read the first part (integer or fractional depending on `first`).
- self.s.eat_while(char::is_ascii_digit);
-
- // Read the fractional part if not already done.
- // Make sure not to confuse a range for the decimal separator.
- if c != '.' && !self.s.at("..") && self.s.eat_if('.') {
- self.s.eat_while(char::is_ascii_digit);
- }
-
- // Read the exponent.
- if !self.s.at("em") && self.s.eat_if(['e', 'E']) {
- self.s.eat_if(['+', '-']);
- self.s.eat_while(char::is_ascii_digit);
- }
-
- // Read the suffix.
- let suffix_start = self.s.cursor();
- if !self.s.eat_if('%') {
- self.s.eat_while(char::is_ascii_alphanumeric);
- }
-
- let number = self.s.get(start .. suffix_start);
- let suffix = self.s.from(suffix_start);
-
- // Find out whether it is a simple number.
- if suffix.is_empty() {
- if let Ok(i) = number.parse::<i64>() {
- return NodeKind::Int(i);
- }
- }
-
- let v = match number.parse::<f64>() {
- Ok(v) => v,
- Err(_) => return NodeKind::Error(ErrorPos::Full, "invalid number".into()),
- };
-
- match suffix {
- "" => NodeKind::Float(v),
- "pt" => NodeKind::Numeric(v, Unit::Length(LengthUnit::Pt)),
- "mm" => NodeKind::Numeric(v, Unit::Length(LengthUnit::Mm)),
- "cm" => NodeKind::Numeric(v, Unit::Length(LengthUnit::Cm)),
- "in" => NodeKind::Numeric(v, Unit::Length(LengthUnit::In)),
- "deg" => NodeKind::Numeric(v, Unit::Angle(AngleUnit::Deg)),
- "rad" => NodeKind::Numeric(v, Unit::Angle(AngleUnit::Rad)),
- "em" => NodeKind::Numeric(v, Unit::Em),
- "fr" => NodeKind::Numeric(v, Unit::Fr),
- "%" => NodeKind::Numeric(v, Unit::Percent),
- _ => NodeKind::Error(ErrorPos::Full, "invalid number suffix".into()),
- }
- }
-
- fn string(&mut self) -> NodeKind {
- let mut escaped = false;
- let verbatim = self.s.eat_until(|c| {
- if c == '"' && !escaped {
- true
- } else {
- escaped = c == '\\' && !escaped;
- false
- }
- });
-
- let string = resolve_string(verbatim);
- if self.s.eat_if('"') {
- NodeKind::Str(string)
- } else {
- self.terminated = false;
- NodeKind::Error(ErrorPos::End, "expected quote".into())
- }
- }
-
- fn in_word(&self) -> bool {
- let alphanumeric = |c: Option<char>| c.map_or(false, |c| c.is_alphanumeric());
- let prev = self.s.scout(-2);
- let next = self.s.peek();
- alphanumeric(prev) && alphanumeric(next)
- }
-}
-
-fn keyword(ident: &str) -> Option<NodeKind> {
- Some(match ident {
- "not" => NodeKind::Not,
- "and" => NodeKind::And,
- "or" => NodeKind::Or,
- "let" => NodeKind::Let,
- "set" => NodeKind::Set,
- "show" => NodeKind::Show,
- "wrap" => NodeKind::Wrap,
- "if" => NodeKind::If,
- "else" => NodeKind::Else,
- "for" => NodeKind::For,
- "in" => NodeKind::In,
- "as" => NodeKind::As,
- "while" => NodeKind::While,
- "break" => NodeKind::Break,
- "continue" => NodeKind::Continue,
- "return" => NodeKind::Return,
- "import" => NodeKind::Import,
- "include" => NodeKind::Include,
- "from" => NodeKind::From,
- _ => return None,
- })
-}
-
-/// The column index of a given index in the source string, given a column
-/// offset for the first line.
-#[inline]
-fn column(string: &str, index: usize, offset: usize) -> usize {
- let mut apply_offset = false;
- let res = string[.. index]
- .char_indices()
- .rev()
- .take_while(|&(_, c)| !is_newline(c))
- .inspect(|&(i, _)| {
- if i == 0 {
- apply_offset = true
- }
- })
- .count();
-
- // The loop is never executed if the slice is empty, but we are of
- // course still at the start of the first line.
- if index == 0 {
- apply_offset = true;
- }
-
- if apply_offset { res + offset } else { res }
-}
-
-/// Whether this character denotes a newline.
-#[inline]
-pub fn is_newline(character: char) -> bool {
- matches!(
- character,
- // Line Feed, Vertical Tab, Form Feed, Carriage Return.
- '\n' | '\x0B' | '\x0C' | '\r' |
- // Next Line, Line Separator, Paragraph Separator.
- '\u{0085}' | '\u{2028}' | '\u{2029}'
- )
-}
-
-/// Whether a string is a valid unicode identifier.
-///
-/// In addition to what is specified in the [Unicode Standard][uax31], we allow:
-/// - `_` as a starting character,
-/// - `_` and `-` as continuing characters.
-///
-/// [uax31]: http://www.unicode.org/reports/tr31/
-#[inline]
-pub fn is_ident(string: &str) -> bool {
- let mut chars = string.chars();
- chars
- .next()
- .map_or(false, |c| is_id_start(c) && chars.all(is_id_continue))
-}
-
-/// Whether a character can start an identifier.
-#[inline]
-fn is_id_start(c: char) -> bool {
- c.is_xid_start() || c == '_'
-}
-
-/// Whether a character can continue an identifier.
-#[inline]
-fn is_id_continue(c: char) -> bool {
- c.is_xid_continue() || c == '_' || c == '-'
-}
-
-/// Whether a character can start an identifier in math.
-#[inline]
-fn is_math_id_start(c: char) -> bool {
- c.is_xid_start()
-}
-
-/// Whether a character can continue an identifier in math.
-#[inline]
-fn is_math_id_continue(c: char) -> bool {
- c.is_xid_continue() && c != '_'
-}
-
-#[cfg(test)]
-#[allow(non_snake_case)]
-mod tests {
- use super::*;
- use crate::parse::tests::check;
-
- use ErrorPos::*;
- use NodeKind::*;
- use Option::None;
- use TokenMode::{Code, Markup};
-
- fn Space(newlines: usize) -> NodeKind {
- NodeKind::Space { newlines }
- }
-
- fn Raw(text: &str, lang: Option<&str>, block: bool) -> NodeKind {
- NodeKind::Raw(Arc::new(RawKind {
- text: text.into(),
- lang: lang.map(Into::into),
- block,
- }))
- }
-
- fn Str(string: &str) -> NodeKind {
- NodeKind::Str(string.into())
- }
-
- fn Text(string: &str) -> NodeKind {
- NodeKind::Text(string.into())
- }
-
- fn Ident(ident: &str) -> NodeKind {
- NodeKind::Ident(ident.into())
- }
-
- fn Error(pos: ErrorPos, message: &str) -> NodeKind {
- NodeKind::Error(pos, message.into())
- }
-
- /// Building blocks for suffix testing.
- ///
- /// We extend each test case with a collection of different suffixes to make
- /// sure tokens end at the correct position. These suffixes are split into
- /// blocks, which can be disabled/enabled per test case. For example, when
- /// testing identifiers we disable letter suffixes because these would
- /// mingle with the identifiers.
- ///
- /// Suffix blocks:
- /// - ' ': spacing
- /// - 'a': letters
- /// - '1': numbers
- /// - '/': symbols
- const BLOCKS: &str = " a1/";
-
- // Suffixes described by four-tuples of:
- //
- // - block the suffix is part of
- // - mode in which the suffix is applicable
- // - the suffix string
- // - the resulting suffix NodeKind
- fn suffixes()
- -> impl Iterator<Item = (char, Option<TokenMode>, &'static str, NodeKind)> {
- [
- // Whitespace suffixes.
- (' ', None, " ", Space(0)),
- (' ', None, "\n", Space(1)),
- (' ', None, "\r", Space(1)),
- (' ', None, "\r\n", Space(1)),
- // Letter suffixes.
- ('a', Some(Markup), "hello", Text("hello")),
- ('a', Some(Markup), "💚", Text("💚")),
- ('a', Some(Code), "val", Ident("val")),
- ('a', Some(Code), "α", Ident("α")),
- ('a', Some(Code), "_", Ident("_")),
- // Number suffixes.
- ('1', Some(Code), "2", Int(2)),
- ('1', Some(Code), ".2", Float(0.2)),
- // Symbol suffixes.
- ('/', None, "[", LeftBracket),
- ('/', None, "//", LineComment),
- ('/', None, "/**/", BlockComment),
- ('/', Some(Markup), "*", Star),
- ('/', Some(Markup), r"\\", Escape('\\')),
- ('/', Some(Markup), "#let", Let),
- ('/', Some(Code), "(", LeftParen),
- ('/', Some(Code), ":", Colon),
- ('/', Some(Code), "+=", PlusEq),
- ]
- .into_iter()
- }
-
- macro_rules! t {
- (Both $($tts:tt)*) => {
- t!(Markup $($tts)*);
- t!(Code $($tts)*);
- };
- ($mode:ident $([$blocks:literal])?: $text:expr => $($token:expr),*) => {{
- // Test without suffix.
- t!(@$mode: $text => $($token),*);
-
- // Test with each applicable suffix.
- for (block, mode, suffix, ref token) in suffixes() {
- let text = $text;
- #[allow(unused_variables)]
- let blocks = BLOCKS;
- $(let blocks = $blocks;)?
- assert!(!blocks.contains(|c| !BLOCKS.contains(c)));
- if (mode.is_none() || mode == Some($mode)) && blocks.contains(block) {
- t!(@$mode: format!("{}{}", text, suffix) => $($token,)* token);
- }
- }
- }};
- (@$mode:ident: $text:expr => $($token:expr),*) => {{
- let text = $text;
- let found = Tokens::new(&text, $mode).collect::<Vec<_>>();
- let expected = vec![$($token.clone()),*];
- check(&text, found, expected);
- }};
- }
-
- #[test]
- fn test_tokenize_brackets() {
- // Test in markup.
- t!(Markup: "{" => LeftBrace);
- t!(Markup: "}" => RightBrace);
- t!(Markup: "[" => LeftBracket);
- t!(Markup: "]" => RightBracket);
- t!(Markup[" /"]: "(" => Text("("));
- t!(Markup[" /"]: ")" => Text(")"));
-
- // Test in code.
- t!(Code: "{" => LeftBrace);
- t!(Code: "}" => RightBrace);
- t!(Code: "[" => LeftBracket);
- t!(Code: "]" => RightBracket);
- t!(Code: "(" => LeftParen);
- t!(Code: ")" => RightParen);
- }
-
- #[test]
- fn test_tokenize_whitespace() {
- // Test basic whitespace.
- t!(Both["a1/"]: "" => );
- t!(Both["a1/"]: " " => Space(0));
- t!(Both["a1/"]: " " => Space(0));
- t!(Both["a1/"]: "\t" => Space(0));
- t!(Both["a1/"]: " \t" => Space(0));
- t!(Both["a1/"]: "\u{202F}" => Space(0));
-
- // Test newline counting.
- t!(Both["a1/"]: "\n" => Space(1));
- t!(Both["a1/"]: "\n " => Space(1));
- t!(Both["a1/"]: " \n" => Space(1));
- t!(Both["a1/"]: " \n " => Space(1));
- t!(Both["a1/"]: "\r\n" => Space(1));
- t!(Both["a1/"]: "\r\n\r" => Space(2));
- t!(Both["a1/"]: " \n\t \n " => Space(2));
- t!(Both["a1/"]: "\n\r" => Space(2));
- t!(Both["a1/"]: " \r\r\n \x0D" => Space(3));
- }
-
- #[test]
- fn test_tokenize_text() {
- // Test basic text.
- t!(Markup[" /"]: "hello" => Text("hello"));
- t!(Markup[" /"]: "reha-world" => Text("reha-world"));
-
- // Test code symbols in text.
- t!(Markup[" /"]: "a():\"b" => Text("a()"), Colon, SmartQuote { double: true }, Text("b"));
- t!(Markup[" /"]: ";,|/+" => Text(";,|/+"));
- t!(Markup[" /"]: "=-a" => Eq, Minus, Text("a"));
- t!(Markup[" "]: "#123" => Text("#123"));
-
- // Test text ends.
- t!(Markup[""]: "hello " => Text("hello"), Space(0));
- t!(Markup[""]: "hello~" => Text("hello"), Shorthand('\u{00A0}'));
- }
-
- #[test]
- fn test_tokenize_escape_sequences() {
- // Test escapable symbols.
- t!(Markup: r"\\" => Escape('\\'));
- t!(Markup: r"\/" => Escape('/'));
- t!(Markup: r"\[" => Escape('['));
- t!(Markup: r"\]" => Escape(']'));
- t!(Markup: r"\{" => Escape('{'));
- t!(Markup: r"\}" => Escape('}'));
- t!(Markup: r"\*" => Escape('*'));
- t!(Markup: r"\_" => Escape('_'));
- t!(Markup: r"\=" => Escape('='));
- t!(Markup: r"\~" => Escape('~'));
- t!(Markup: r"\'" => Escape('\''));
- t!(Markup: r#"\""# => Escape('"'));
- t!(Markup: r"\`" => Escape('`'));
- t!(Markup: r"\$" => Escape('$'));
- t!(Markup: r"\#" => Escape('#'));
- t!(Markup: r"\a" => Escape('a'));
- t!(Markup: r"\u" => Escape('u'));
- t!(Markup: r"\1" => Escape('1'));
-
- // Test basic unicode escapes.
- t!(Markup: r"\u{}" => Error(Full, "invalid unicode escape sequence"));
- t!(Markup: r"\u{2603}" => Escape('☃'));
- t!(Markup: r"\u{P}" => Error(Full, "invalid unicode escape sequence"));
-
- // Test unclosed unicode escapes.
- t!(Markup[" /"]: r"\u{" => Error(End, "expected closing brace"));
- t!(Markup[" /"]: r"\u{1" => Error(End, "expected closing brace"));
- t!(Markup[" /"]: r"\u{26A4" => Error(End, "expected closing brace"));
- t!(Markup[" /"]: r"\u{1Q3P" => Error(End, "expected closing brace"));
- t!(Markup: r"\u{1🏕}" => Error(End, "expected closing brace"), Text("🏕"), RightBrace);
- }
-
- #[test]
- fn test_tokenize_markup_symbols() {
- // Test markup tokens.
- t!(Markup[" a1"]: "*" => Star);
- t!(Markup: "_" => Underscore);
- t!(Markup[""]: "===" => Eq, Eq, Eq);
- t!(Markup["a1/"]: "= " => Eq, Space(0));
- t!(Markup[" "]: r"\" => Linebreak);
- t!(Markup: "~" => Shorthand('\u{00A0}'));
- t!(Markup["a1/"]: "-?" => Shorthand('\u{00AD}'));
- t!(Markup["a "]: r"a--" => Text("a"), Shorthand('\u{2013}'));
- t!(Markup["a1/"]: "- " => Minus, Space(0));
- t!(Markup[" "]: "+" => Plus);
- t!(Markup[" "]: "1." => EnumNumbering(1));
- t!(Markup[" "]: "1.a" => EnumNumbering(1), Text("a"));
- t!(Markup[" /"]: "a1." => Text("a1."));
- }
-
- #[test]
- fn test_tokenize_code_symbols() {
- // Test all symbols.
- t!(Code: "," => Comma);
- t!(Code: ";" => Semicolon);
- t!(Code: ":" => Colon);
- t!(Code: "+" => Plus);
- t!(Code: "-" => Minus);
- t!(Code[" a1"]: "*" => Star);
- t!(Code[" a1"]: "/" => Slash);
- t!(Code[" a/"]: "." => Dot);
- t!(Code: "=" => Eq);
- t!(Code: "==" => EqEq);
- t!(Code: "!=" => ExclEq);
- t!(Code: "<" => Lt);
- t!(Code: "<=" => LtEq);
- t!(Code: ">" => Gt);
- t!(Code: ">=" => GtEq);
- t!(Code: "+=" => PlusEq);
- t!(Code: "-=" => HyphEq);
- t!(Code: "*=" => StarEq);
- t!(Code: "/=" => SlashEq);
- t!(Code: ".." => Dots);
- t!(Code: "=>" => Arrow);
-
- // Test combinations.
- t!(Code: "<=>" => LtEq, Gt);
- t!(Code[" a/"]: "..." => Dots, Dot);
-
- // Test hyphen as symbol vs part of identifier.
- t!(Code[" /"]: "-1" => Minus, Int(1));
- t!(Code[" /"]: "-a" => Minus, Ident("a"));
- t!(Code[" /"]: "--1" => Minus, Minus, Int(1));
- t!(Code[" /"]: "--_a" => Minus, Minus, Ident("_a"));
- t!(Code[" /"]: "a-b" => Ident("a-b"));
-
- // Test invalid.
- t!(Code: r"\" => Error(Full, "not valid here"));
- }
-
- #[test]
- fn test_tokenize_keywords() {
- // A list of a few (not all) keywords.
- let list = [
- ("not", Not),
- ("let", Let),
- ("if", If),
- ("else", Else),
- ("for", For),
- ("in", In),
- ("import", Import),
- ];
-
- for (s, t) in list.clone() {
- t!(Markup[" "]: format!("#{}", s) => t);
- t!(Markup[" "]: format!("#{0}#{0}", s) => t, t);
- t!(Markup[" /"]: format!("# {}", s) => Text(&format!("# {s}")));
- }
-
- for (s, t) in list {
- t!(Code[" "]: s => t);
- t!(Markup[" /"]: s => Text(s));
- }
-
- // Test simple identifier.
- t!(Markup[" "]: "#letter" => Ident("letter"));
- t!(Code[" /"]: "falser" => Ident("falser"));
- t!(Code[" /"]: "None" => Ident("None"));
- t!(Code[" /"]: "True" => Ident("True"));
- }
-
- #[test]
- fn test_tokenize_raw_blocks() {
- // Test basic raw block.
- t!(Markup: "``" => Raw("", None, false));
- t!(Markup: "`raw`" => Raw("raw", None, false));
- t!(Markup[""]: "`]" => Error(End, "expected 1 backtick"));
-
- // Test special symbols in raw block.
- t!(Markup: "`[brackets]`" => Raw("[brackets]", None, false));
- t!(Markup[""]: r"`\`` " => Raw(r"\", None, false), Error(End, "expected 1 backtick"));
-
- // Test separated closing backticks.
- t!(Markup: "```not `y`e`t```" => Raw("`y`e`t", Some("not"), false));
-
- // Test more backticks.
- t!(Markup: "``nope``" => Raw("", None, false), Text("nope"), Raw("", None, false));
- t!(Markup: "````🚀````" => Raw("", None, false));
- t!(Markup[""]: "`````👩‍🚀````noend" => Error(End, "expected 5 backticks"));
- t!(Markup[""]: "````raw``````" => Raw("", Some("raw"), false), Raw("", None, false));
- }
-
- #[test]
- fn test_tokenize_idents() {
- // Test valid identifiers.
- t!(Code[" /"]: "x" => Ident("x"));
- t!(Code[" /"]: "value" => Ident("value"));
- t!(Code[" /"]: "__main__" => Ident("__main__"));
- t!(Code[" /"]: "_snake_case" => Ident("_snake_case"));
-
- // Test non-ascii.
- t!(Code[" /"]: "α" => Ident("α"));
- t!(Code[" /"]: "ម្តាយ" => Ident("ម្តាយ"));
-
- // Test hyphen parsed as identifier.
- t!(Code[" /"]: "kebab-case" => Ident("kebab-case"));
- t!(Code[" /"]: "one-10" => Ident("one-10"));
- }
-
- #[test]
- fn test_tokenize_numeric() {
- let ints = [("7", 7), ("012", 12)];
- let floats = [
- (".3", 0.3),
- ("0.3", 0.3),
- ("3.", 3.0),
- ("3.0", 3.0),
- ("14.3", 14.3),
- ("10e2", 1000.0),
- ("10e+0", 10.0),
- ("10e+1", 100.0),
- ("10e-2", 0.1),
- ("10.e1", 100.0),
- ("10.e-1", 1.0),
- (".1e1", 1.0),
- ("10E2", 1000.0),
- ];
-
- // Test integers.
- for &(s, v) in &ints {
- t!(Code[" /"]: s => Int(v));
- }
-
- // Test floats.
- for &(s, v) in &floats {
- t!(Code[" /"]: s => Float(v));
- }
-
- // Test attached numbers.
- t!(Code[" /"]: ".2.3" => Float(0.2), Float(0.3));
- t!(Code[" /"]: "1.2.3" => Float(1.2), Float(0.3));
- t!(Code[" /"]: "1e-2+3" => Float(0.01), Plus, Int(3));
-
- // Test float from too large integer.
- let large = i64::MAX as f64 + 1.0;
- t!(Code[" /"]: large.to_string() => Float(large));
-
- // Combined integers and floats.
- let nums = ints.iter().map(|&(k, v)| (k, v as f64)).chain(floats);
-
- let suffixes: &[(&str, fn(f64) -> NodeKind)] = &[
- ("mm", |x| Numeric(x, Unit::Length(LengthUnit::Mm))),
- ("pt", |x| Numeric(x, Unit::Length(LengthUnit::Pt))),
- ("cm", |x| Numeric(x, Unit::Length(LengthUnit::Cm))),
- ("in", |x| Numeric(x, Unit::Length(LengthUnit::In))),
- ("rad", |x| Numeric(x, Unit::Angle(AngleUnit::Rad))),
- ("deg", |x| Numeric(x, Unit::Angle(AngleUnit::Deg))),
- ("em", |x| Numeric(x, Unit::Em)),
- ("fr", |x| Numeric(x, Unit::Fr)),
- ("%", |x| Numeric(x, Unit::Percent)),
- ];
-
- // Numeric types.
- for &(suffix, build) in suffixes {
- for (s, v) in nums.clone() {
- t!(Code[" /"]: format!("{}{}", s, suffix) => build(v));
- }
- }
-
- // Multiple dots close the number.
- t!(Code[" /"]: "1..2" => Int(1), Dots, Int(2));
- t!(Code[" /"]: "1..2.3" => Int(1), Dots, Float(2.3));
- t!(Code[" /"]: "1.2..3" => Float(1.2), Dots, Int(3));
-
- // Test invalid.
- t!(Code[" /"]: "1foo" => Error(Full, "invalid number suffix"));
- }
-
- #[test]
- fn test_tokenize_strings() {
- // Test basic strings.
- t!(Code: "\"hi\"" => Str("hi"));
- t!(Code: "\"hi\nthere\"" => Str("hi\nthere"));
- t!(Code: "\"🌎\"" => Str("🌎"));
-
- // Test unterminated.
- t!(Code[""]: "\"hi" => Error(End, "expected quote"));
-
- // Test escaped quote.
- t!(Code: r#""a\"bc""# => Str("a\"bc"));
- t!(Code[""]: r#""\""# => Error(End, "expected quote"));
- }
-
- #[test]
- fn test_tokenize_line_comments() {
- // Test line comment with no trailing newline.
- t!(Both[""]: "//" => LineComment);
-
- // Test line comment ends at newline.
- t!(Both["a1/"]: "//bc\n" => LineComment, Space(1));
- t!(Both["a1/"]: "// bc \n" => LineComment, Space(1));
- t!(Both["a1/"]: "//bc\r\n" => LineComment, Space(1));
-
- // Test nested line comments.
- t!(Both["a1/"]: "//a//b\n" => LineComment, Space(1));
- }
-
- #[test]
- fn test_tokenize_block_comments() {
- // Test basic block comments.
- t!(Both[""]: "/*" => BlockComment);
- t!(Both: "/**/" => BlockComment);
- t!(Both: "/*🏞*/" => BlockComment);
- t!(Both: "/*\n*/" => BlockComment);
-
- // Test depth 1 and 2 nested block comments.
- t!(Both: "/* /* */ */" => BlockComment);
- t!(Both: "/*/*/**/*/*/" => BlockComment);
-
- // Test two nested, one unclosed block comments.
- t!(Both[""]: "/*/*/**/*/" => BlockComment);
-
- // Test all combinations of up to two following slashes and stars.
- t!(Both[""]: "/*" => BlockComment);
- t!(Both[""]: "/*/" => BlockComment);
- t!(Both[""]: "/**" => BlockComment);
- t!(Both[""]: "/*//" => BlockComment);
- t!(Both[""]: "/*/*" => BlockComment);
- t!(Both[""]: "/**/" => BlockComment);
- t!(Both[""]: "/***" => BlockComment);
-
- // Test unexpected terminator.
- t!(Both: "/*Hi*/*/" => BlockComment,
- Error(Full, "unexpected end of block comment"));
- }
-}