diff options
Diffstat (limited to 'src/syntax')
| -rw-r--r-- | src/syntax/highlight.rs | 6 | ||||
| -rw-r--r-- | src/syntax/incremental.rs | 518 | ||||
| -rw-r--r-- | src/syntax/mod.rs | 567 | ||||
| -rw-r--r-- | src/syntax/node.rs | 548 | ||||
| -rw-r--r-- | src/syntax/parser.rs | 558 | ||||
| -rw-r--r-- | src/syntax/parsing.rs | 1140 | ||||
| -rw-r--r-- | src/syntax/resolve.rs | 237 | ||||
| -rw-r--r-- | src/syntax/source.rs | 430 | ||||
| -rw-r--r-- | src/syntax/span.rs | 6 | ||||
| -rw-r--r-- | src/syntax/tokens.rs | 1176 |
10 files changed, 4638 insertions, 548 deletions
diff --git a/src/syntax/highlight.rs b/src/syntax/highlight.rs index bfb36078..325b7274 100644 --- a/src/syntax/highlight.rs +++ b/src/syntax/highlight.rs @@ -6,7 +6,7 @@ use std::ops::Range; use syntect::highlighting::{Color, FontStyle, Highlighter, Style, Theme}; use syntect::parsing::Scope; -use super::{NodeKind, SyntaxNode}; +use super::{parse, NodeKind, SyntaxNode}; /// Highlight source text into a standalone HTML document. pub fn highlight_html(text: &str, theme: &Theme) -> String { @@ -28,7 +28,7 @@ pub fn highlight_pre(text: &str, theme: &Theme) -> String { let mut buf = String::new(); buf.push_str("<pre>\n"); - let root = crate::parse::parse(text); + let root = parse(text); highlight_themed(&root, theme, |range, style| { let styled = style != Style::default(); if styled { @@ -401,8 +401,8 @@ impl Category { #[cfg(test)] mod tests { + use super::super::Source; use super::*; - use crate::source::Source; #[test] fn test_highlighting() { diff --git a/src/syntax/incremental.rs b/src/syntax/incremental.rs new file mode 100644 index 00000000..529defd7 --- /dev/null +++ b/src/syntax/incremental.rs @@ -0,0 +1,518 @@ +use std::ops::Range; +use std::sync::Arc; + +use super::{ + is_newline, parse, reparse_code_block, reparse_content_block, + reparse_markup_elements, InnerNode, NodeKind, Span, SyntaxNode, +}; + +/// Refresh the given syntax node with as little parsing as possible. +/// +/// Takes the new source, the range in the old source that was replaced and the +/// length of the replacement. +/// +/// Returns the range in the new source that was ultimately reparsed. 
+pub fn reparse( + root: &mut SyntaxNode, + text: &str, + replaced: Range<usize>, + replacement_len: usize, +) -> Range<usize> { + if let SyntaxNode::Inner(inner) = root { + let change = Change { text, replaced, replacement_len }; + if let Some(range) = try_reparse(&change, Arc::make_mut(inner), 0, true, true) { + return range; + } + } + + let id = root.span().source(); + *root = parse(text); + root.numberize(id, Span::FULL).unwrap(); + 0 .. text.len() +} + +/// Try to reparse inside the given node. +fn try_reparse( + change: &Change, + node: &mut InnerNode, + mut offset: usize, + outermost: bool, + safe_to_replace: bool, +) -> Option<Range<usize>> { + let is_markup = matches!(node.kind(), NodeKind::Markup { .. }); + let original_count = node.children().len(); + let original_offset = offset; + + let mut search = SearchState::default(); + let mut ahead: Option<Ahead> = None; + + // Whether the first node that should be replaced is at start. + let mut at_start = true; + + // Whether the last searched child is the outermost child. + let mut child_outermost = false; + + // Find the the first child in the range of children to reparse. + for (i, child) in node.children().enumerate() { + let pos = NodePos { idx: i, offset }; + let child_span = offset .. offset + child.len(); + child_outermost = outermost && i + 1 == original_count; + + match search { + SearchState::NoneFound => { + // The edit is contained within the span of the current element. + if child_span.contains(&change.replaced.start) + && child_span.end >= change.replaced.end + { + // In Markup mode, we want to consider a non-whitespace + // neighbor if the edit is on the node boundary. 
+ search = if is_markup && child_span.end == change.replaced.end { + SearchState::RequireNonTrivia(pos) + } else { + SearchState::Contained(pos) + }; + } else if child_span.contains(&change.replaced.start) { + search = SearchState::Inside(pos); + } else if child_span.end == change.replaced.start + && change.replaced.start == change.replaced.end + && child_outermost + { + search = SearchState::SpanFound(pos, pos); + } else { + // Update compulsary state of `ahead_nontrivia`. + if let Some(ahead_nontrivia) = ahead.as_mut() { + if let NodeKind::Space { newlines: (1 ..) } = child.kind() { + ahead_nontrivia.newline(); + } + } + + // We look only for non spaces, non-semicolon and also + // reject text that points to the special case for URL + // evasion and line comments. + if !child.kind().is_space() + && child.kind() != &NodeKind::Semicolon + && child.kind() != &NodeKind::Text('/'.into()) + && (ahead.is_none() || change.replaced.start > child_span.end) + && !ahead.map_or(false, Ahead::is_compulsory) + { + ahead = Some(Ahead::new(pos, at_start, is_bounded(child.kind()))); + } + + at_start = next_at_start(child.kind(), at_start); + } + } + SearchState::Inside(start) => { + if child_span.end == change.replaced.end { + search = SearchState::RequireNonTrivia(start); + } else if child_span.end > change.replaced.end { + search = SearchState::SpanFound(start, pos); + } + } + SearchState::RequireNonTrivia(start) => { + if !child.kind().is_trivia() { + search = SearchState::SpanFound(start, pos); + } + } + _ => unreachable!(), + } + + offset += child.len(); + + if search.done().is_some() { + break; + } + } + + // If we were looking for a non-whitespace element and hit the end of + // the file here, we instead use EOF as the end of the span. 
+ if let SearchState::RequireNonTrivia(start) = search { + search = SearchState::SpanFound(start, NodePos { + idx: node.children().len() - 1, + offset: offset - node.children().last().unwrap().len(), + }) + } + + if let SearchState::Contained(pos) = search { + // Do not allow replacement of elements inside of constructs whose + // opening and closing brackets look the same. + let safe_inside = is_bounded(node.kind()); + let child = &mut node.children_mut()[pos.idx]; + let prev_len = child.len(); + let prev_descendants = child.descendants(); + + if let Some(range) = match child { + SyntaxNode::Inner(node) => try_reparse( + change, + Arc::make_mut(node), + pos.offset, + child_outermost, + safe_inside, + ), + SyntaxNode::Leaf(_) => None, + } { + let new_len = child.len(); + let new_descendants = child.descendants(); + node.update_parent(prev_len, new_len, prev_descendants, new_descendants); + return Some(range); + } + + let superseded_span = pos.offset .. pos.offset + prev_len; + let func: Option<ReparseMode> = match child.kind() { + NodeKind::CodeBlock => Some(ReparseMode::Code), + NodeKind::ContentBlock => Some(ReparseMode::Content), + _ => None, + }; + + // Return if the element was reparsable on its own, otherwise try to + // treat it as a markup element. + if let Some(func) = func { + if let Some(result) = replace( + change, + node, + func, + pos.idx .. pos.idx + 1, + superseded_span, + outermost, + ) { + return Some(result); + } + } + } + + // Make sure this is a markup node and that we may replace. If so, save + // the current indent. 
+ let min_indent = match node.kind() { + NodeKind::Markup { min_indent } if safe_to_replace => *min_indent, + _ => return None, + }; + + let (mut start, end) = search.done()?; + if let Some(ahead) = ahead { + if start.offset == change.replaced.start || ahead.is_compulsory() { + start = ahead.pos; + at_start = ahead.at_start; + } + } else { + start = NodePos { idx: 0, offset: original_offset }; + } + + let superseded_span = + start.offset .. end.offset + node.children().as_slice()[end.idx].len(); + + replace( + change, + node, + ReparseMode::MarkupElements { at_start, min_indent }, + start.idx .. end.idx + 1, + superseded_span, + outermost, + ) +} + +/// Reparse the superseded nodes and replace them. +fn replace( + change: &Change, + node: &mut InnerNode, + mode: ReparseMode, + superseded_idx: Range<usize>, + superseded_span: Range<usize>, + outermost: bool, +) -> Option<Range<usize>> { + let superseded_start = superseded_idx.start; + + let differential: isize = + change.replacement_len as isize - change.replaced.len() as isize; + let newborn_end = (superseded_span.end as isize + differential) as usize; + let newborn_span = superseded_span.start .. newborn_end; + + let mut prefix = ""; + for (i, c) in change.text[.. newborn_span.start].char_indices().rev() { + if is_newline(c) { + break; + } + prefix = &change.text[i .. 
newborn_span.start]; + } + + let (newborns, terminated, amount) = match mode { + ReparseMode::Code => reparse_code_block( + &prefix, + &change.text[newborn_span.start ..], + newborn_span.len(), + ), + ReparseMode::Content => reparse_content_block( + &prefix, + &change.text[newborn_span.start ..], + newborn_span.len(), + ), + ReparseMode::MarkupElements { at_start, min_indent } => reparse_markup_elements( + &prefix, + &change.text[newborn_span.start ..], + newborn_span.len(), + differential, + &node.children().as_slice()[superseded_start ..], + at_start, + min_indent, + ), + }?; + + // Do not accept unclosed nodes if the old node wasn't at the right edge + // of the tree. + if !outermost && !terminated { + return None; + } + + node.replace_children(superseded_start .. superseded_start + amount, newborns) + .ok()?; + + Some(newborn_span) +} + +/// A description of a change. +struct Change<'a> { + /// The new source code, with the change applied. + text: &'a str, + /// Which range in the old source file was changed. + replaced: Range<usize>, + /// How many characters replaced the text in `replaced`. + replacement_len: usize, +} + +/// Encodes the state machine of the search for the nodes are pending for +/// replacement. +#[derive(Clone, Copy, Debug, PartialEq)] +enum SearchState { + /// Neither an end nor a start have been found as of now. + /// The latest non-trivia child is continually saved. + NoneFound, + /// The search has concluded by finding a node that fully contains the + /// modifications. + Contained(NodePos), + /// The search has found the start of the modified nodes. + Inside(NodePos), + /// The search has found the end of the modified nodes but the change + /// touched its boundries so another non-trivia node is needed. + RequireNonTrivia(NodePos), + /// The search has concluded by finding a start and an end index for nodes + /// with a pending reparse. 
+ SpanFound(NodePos, NodePos), +} + +impl Default for SearchState { + fn default() -> Self { + Self::NoneFound + } +} + +impl SearchState { + fn done(self) -> Option<(NodePos, NodePos)> { + match self { + Self::NoneFound => None, + Self::Contained(s) => Some((s, s)), + Self::Inside(_) => None, + Self::RequireNonTrivia(_) => None, + Self::SpanFound(s, e) => Some((s, e)), + } + } +} + +/// The position of a syntax node. +#[derive(Clone, Copy, Debug, PartialEq)] +struct NodePos { + /// The index in the parent node. + idx: usize, + /// The byte offset in the string. + offset: usize, +} + +/// An ahead node with an index and whether it is `at_start`. +#[derive(Clone, Copy, Debug, PartialEq)] +struct Ahead { + /// The position of the node. + pos: NodePos, + /// The `at_start` before this node. + at_start: bool, + /// The kind of ahead node. + kind: AheadKind, +} + +/// The kind of ahead node. +#[derive(Clone, Copy, Debug, PartialEq)] +enum AheadKind { + /// A normal non-trivia child has been found. + Normal, + /// An unbounded child has been found. The boolean indicates whether it was + /// on the current line, in which case adding it to the reparsing range is + /// compulsory. + Unbounded(bool), +} + +impl Ahead { + fn new(pos: NodePos, at_start: bool, bounded: bool) -> Self { + Self { + pos, + at_start, + kind: if bounded { + AheadKind::Normal + } else { + AheadKind::Unbounded(true) + }, + } + } + + fn newline(&mut self) { + if let AheadKind::Unbounded(current_line) = &mut self.kind { + *current_line = false; + } + } + + fn is_compulsory(self) -> bool { + matches!(self.kind, AheadKind::Unbounded(true)) + } +} + +/// Which reparse function to choose for a span of elements. +#[derive(Clone, Copy, Debug, PartialEq)] +enum ReparseMode { + /// Reparse a code block, including its braces. + Code, + /// Reparse a content block, including its square brackets. + Content, + /// Reparse elements of the markup. 
Also specified the initial `at_start` + /// state for the reparse and the minimum indent of the reparsed nodes. + MarkupElements { at_start: bool, min_indent: usize }, +} + +/// Whether changes _inside_ this node are safely encapsulated, so that only +/// this node must be reparsed. +fn is_bounded(kind: &NodeKind) -> bool { + match kind { + NodeKind::CodeBlock + | NodeKind::ContentBlock + | NodeKind::Linebreak + | NodeKind::SmartQuote { .. } + | NodeKind::BlockComment + | NodeKind::Space { .. } + | NodeKind::Escape(_) + | NodeKind::Shorthand(_) => true, + _ => false, + } +} + +/// Whether `at_start` would still be true after this node given the +/// previous value of the property. +fn next_at_start(kind: &NodeKind, prev: bool) -> bool { + match kind { + NodeKind::Space { newlines: (1 ..) } => true, + NodeKind::Space { .. } | NodeKind::LineComment | NodeKind::BlockComment => prev, + _ => false, + } +} + +#[cfg(test)] +#[rustfmt::skip] +mod tests { + use super::*; + use super::super::{parse, Source}; + use super::super::tests::check; + + #[track_caller] + fn test(prev: &str, range: Range<usize>, with: &str, goal: Range<usize>) { + let mut source = Source::detached(prev); + let range = source.edit(range, with); + check(source.text(), source.root(), &parse(source.text())); + assert_eq!(range, goal); + } + + #[test] + fn test_parse_incremental_simple_replacements() { + test("hello world", 7 .. 12, "walkers", 0 .. 14); + test("some content", 0..12, "", 0..0); + test("", 0..0, "do it", 0..5); + test("a d e", 1 .. 3, " b c d", 0 .. 9); + test("*~ *", 2..2, "*", 0..5); + test("_1_\n2a\n3", 5..5, "4", 4..7); + test("_1_\n2a\n3~", 8..8, "4", 4..10); + test("_1_ 2 3a\n4", 7..7, "5", 0..9); + test("* {1+2} *", 5..6, "3", 2..7); + test("a #f() e", 1 .. 6, " b c d", 0 .. 9); + test("a\nb\nc\nd\ne\n", 5 .. 5, "c", 2 .. 7); + test("a\n\nb\n\nc\n\nd\n\ne\n", 7 .. 7, "c", 3 .. 10); + test("a\nb\nc *hel a b lo* d\nd\ne", 13..13, "c ", 4..20); + test("~~ {a} ~~", 4 .. 5, "b", 3 .. 
6); + test("{(0, 1, 2)}", 5 .. 6, "11pt", 0..14); + test("\n= A heading", 4 .. 4, "n evocative", 0 .. 23); + test("for~your~thing", 9 .. 9, "a", 0 .. 15); + test("a your thing a", 6 .. 7, "a", 0 .. 14); + test("{call(); abc}", 7 .. 7, "[]", 0 .. 15); + test("#call() abc", 7 .. 7, "[]", 0 .. 10); + test("hi[\n- item\n- item 2\n - item 3]", 11 .. 11, " ", 2 .. 35); + test("hi\n- item\nno item\n - item 3", 10 .. 10, "- ", 3..19); + test("#grid(columns: (auto, 1fr, 40%), [*plonk*], rect(width: 100%, height: 1pt, fill: conifer), [thing])", 16 .. 20, "none", 0..99); + test("#grid(columns: (auto, 1fr, 40%), [*plonk*], rect(width: 100%, height: 1pt, fill: conifer), [thing])", 33 .. 42, "[_gronk_]", 33..42); + test("#grid(columns: (auto, 1fr, 40%), [*plonk*], rect(width: 100%, height: 1pt, fill: conifer), [thing])", 34 .. 41, "_bar_", 33 .. 40); + test("{let i=1; for x in range(5) {i}}", 6 .. 6, " ", 0 .. 33); + test("{let i=1; for x in range(5) {i}}", 13 .. 14, " ", 0 .. 33); + test("hello~~{x}", 7 .. 10, "#f()", 0 .. 11); + test("this~is -- in my opinion -- spectacular", 8 .. 10, "---", 0 .. 25); + test("understanding `code` is complicated", 15 .. 15, "C ", 0 .. 22); + test("{ let x = g() }", 10 .. 12, "f(54", 0 .. 17); + test(r#"a ```typst hello``` b"#, 16 .. 17, "", 0 .. 18); + test(r#"a ```typst hello```"#, 16 .. 17, "", 0 .. 18); + test("#for", 4 .. 4, "//", 0 .. 6); + test("#show a: f as b..", 16..16, "c", 0..18); + test("a\n#let \nb", 7 .. 7, "i", 2 .. 9); + test("a\n#for i \nb", 9 .. 9, "in", 2 .. 12); + test("a~https://fun/html", 13..14, "n", 0..18); + } + + #[test] + fn test_parse_incremental_whitespace_invariants() { + test("hello \\ world", 7 .. 8, "a ", 0 .. 14); + test("hello \\ world", 7 .. 8, " a", 0 .. 14); + test("x = y", 1 .. 1, " + y", 0 .. 6); + test("x = y", 1 .. 1, " + y\n", 0 .. 7); + test("abc\n= a heading\njoke", 3 .. 4, "\nmore\n\n", 0 .. 21); + test("abc\n= a heading\njoke", 3 .. 4, "\nnot ", 0 .. 
19); + test("#let x = (1, 2 + ;~ Five\r\n\r", 20 .. 23, "2.", 0 .. 23); + test("hey #myfriend", 4 .. 4, "\\", 0 .. 14); + test("hey #myfriend", 4 .. 4, "\\", 0 .. 6); + test("= foo\nbar\n - a\n - b", 6 .. 9, "", 0 .. 11); + test("= foo\n bar\n baz", 6 .. 8, "", 0 .. 9); + test(" // hi", 1 .. 1, " ", 0 .. 7); + test("- \nA", 2..3, "", 0..3); + } + + #[test] + fn test_parse_incremental_type_invariants() { + test("a #for x in array {x}", 18 .. 21, "[#x]", 0 .. 22); + test("a #let x = 1 {5}", 3 .. 6, "if", 0 .. 11); + test("a {let x = 1 {5}} b", 3 .. 6, "if", 2 .. 16); + test("#let x = 1 {5}", 4 .. 4, " if", 0 .. 13); + test("{let x = 1 {5}}", 4 .. 4, " if", 0 .. 18); + test("a // b c #f()", 3 .. 4, "", 0 .. 12); + test("{\nf()\n//g(a)\n}", 6 .. 8, "", 0 .. 12); + test("a{\nf()\n//g(a)\n}b", 7 .. 9, "", 1 .. 13); + test("a #while x {\n g(x) \n} b", 11 .. 11, "//", 0 .. 26); + test("{(1, 2)}", 1 .. 1, "while ", 0 .. 14); + test("a b c", 1 .. 1, "{[}", 0 .. 8); + } + + #[test] + fn test_parse_incremental_wrongly_or_unclosed_things() { + test(r#"{"hi"}"#, 4 .. 5, "c", 0 .. 6); + test(r"this \u{abcd}", 8 .. 9, "", 0 .. 12); + test(r"this \u{abcd} that", 12 .. 13, "", 0 .. 17); + test(r"{{let x = z}; a = 1} b", 6 .. 6, "//", 0 .. 24); + test("a b c", 1 .. 1, " /* letters */", 0 .. 19); + test("a b c", 1 .. 1, " /* letters", 0 .. 16); + test("{if i==1 {a} else [b]; b()}", 12 .. 12, " /* letters */", 0 .. 41); + test("{if i==1 {a} else [b]; b()}", 12 .. 12, " /* letters", 0 .. 38); + test("~~~~", 2 .. 2, "[]", 0 .. 5); + test("a[]b", 2 .. 2, "{", 1 .. 4); + test("[hello]", 2 .. 3, "]", 0 .. 7); + test("{a}", 1 .. 2, "b", 0 .. 3); + test("{ a; b; c }", 5 .. 6, "[}]", 0 .. 13); + test("#a()\n~", 3..4, "{}", 0..7); + test("[]\n~", 1..2, "#if i==0 {true}", 0..18); + } +} diff --git a/src/syntax/mod.rs b/src/syntax/mod.rs index 8b172def..1a23db5f 100644 --- a/src/syntax/mod.rs +++ b/src/syntax/mod.rs @@ -1,558 +1,41 @@ -//! Syntax types. +//! 
Syntax definition, parsing, and highlighting. pub mod ast; pub mod highlight; +mod incremental; mod kind; +mod node; +mod parser; +mod parsing; +mod resolve; +mod source; mod span; - -use std::fmt::{self, Debug, Formatter}; -use std::ops::Range; -use std::sync::Arc; +mod tokens; pub use kind::*; +pub use node::*; +pub use parsing::*; +pub use source::*; pub use span::*; +pub use tokens::*; -use self::ast::TypedNode; -use crate::diag::SourceError; -use crate::source::SourceId; - -/// An inner or leaf node in the untyped syntax tree. -#[derive(Clone, PartialEq, Hash)] -pub enum SyntaxNode { - /// A reference-counted inner node. - Inner(Arc<InnerNode>), - /// A leaf token. - Leaf(NodeData), -} - -impl SyntaxNode { - /// The metadata of the node. - pub fn data(&self) -> &NodeData { - match self { - Self::Inner(inner) => &inner.data, - Self::Leaf(leaf) => leaf, - } - } - - /// The type of the node. - pub fn kind(&self) -> &NodeKind { - self.data().kind() - } - - /// The length of the node. - pub fn len(&self) -> usize { - self.data().len() - } - - /// The number of descendants, including the node itself. - pub fn descendants(&self) -> usize { - match self { - Self::Inner(inner) => inner.descendants(), - Self::Leaf(_) => 1, - } - } - - /// The span of the node. - pub fn span(&self) -> Span { - self.data().span() - } - - /// Whether the node or its children contain an error. - pub fn erroneous(&self) -> bool { - match self { - Self::Inner(node) => node.erroneous, - Self::Leaf(data) => data.kind.is_error(), - } - } +use incremental::reparse; +use parser::*; - /// The error messages for this node and its descendants. 
- pub fn errors(&self) -> Vec<SourceError> { - if !self.erroneous() { - return vec![]; - } +#[cfg(test)] +mod tests { + use std::fmt::Debug; - match self.kind() { - NodeKind::Error(pos, message) => { - vec![SourceError::new(self.span(), message.clone()).with_pos(*pos)] - } - _ => self - .children() - .filter(|node| node.erroneous()) - .flat_map(|node| node.errors()) - .collect(), - } - } - - /// The node's children. - pub fn children(&self) -> std::slice::Iter<'_, SyntaxNode> { - match self { - Self::Inner(inner) => inner.children(), - Self::Leaf(_) => [].iter(), - } - } - - /// Convert the node to a typed AST node. - pub fn cast<T>(&self) -> Option<T> + #[track_caller] + pub fn check<T>(text: &str, found: T, expected: T) where - T: TypedNode, + T: Debug + PartialEq, { - T::from_untyped(self) - } - - /// Get the first child that can cast to the AST type `T`. - pub fn cast_first_child<T: TypedNode>(&self) -> Option<T> { - self.children().find_map(Self::cast) - } - - /// Get the last child that can cast to the AST type `T`. - pub fn cast_last_child<T: TypedNode>(&self) -> Option<T> { - self.children().rev().find_map(Self::cast) - } - - /// Change the type of the node. - pub fn convert(&mut self, kind: NodeKind) { - match self { - Self::Inner(inner) => { - let node = Arc::make_mut(inner); - node.erroneous |= kind.is_error(); - node.data.kind = kind; - } - Self::Leaf(leaf) => leaf.kind = kind, - } - } - - /// Set a synthetic span for the node and all its descendants. - pub fn synthesize(&mut self, span: Span) { - match self { - Self::Inner(inner) => Arc::make_mut(inner).synthesize(span), - Self::Leaf(leaf) => leaf.synthesize(span), - } - } - - /// Assign spans to each node. - pub fn numberize(&mut self, id: SourceId, within: Range<u64>) -> NumberingResult { - match self { - Self::Inner(inner) => Arc::make_mut(inner).numberize(id, None, within), - Self::Leaf(leaf) => leaf.numberize(id, within), - } - } - - /// The upper bound of assigned numbers in this subtree. 
- pub fn upper(&self) -> u64 { - match self { - Self::Inner(inner) => inner.upper(), - Self::Leaf(leaf) => leaf.span().number() + 1, - } - } - - /// If the span points into this node, convert it to a byte range. - pub fn range(&self, span: Span, offset: usize) -> Option<Range<usize>> { - match self { - Self::Inner(inner) => inner.range(span, offset), - Self::Leaf(leaf) => leaf.range(span, offset), - } - } - - /// Returns all leaf descendants of this node (may include itself). - /// - /// This method is slow and only intended for testing. - pub fn leafs(&self) -> Vec<Self> { - if match self { - Self::Inner(inner) => inner.children.is_empty(), - Self::Leaf(_) => true, - } { - vec![self.clone()] - } else { - self.children().flat_map(Self::leafs).collect() - } - } -} - -impl Default for SyntaxNode { - fn default() -> Self { - Self::Leaf(NodeData::new(NodeKind::None, 0)) - } -} - -impl Debug for SyntaxNode { - fn fmt(&self, f: &mut Formatter) -> fmt::Result { - match self { - Self::Inner(node) => node.fmt(f), - Self::Leaf(token) => token.fmt(f), - } - } -} - -/// An inner node in the untyped syntax tree. -#[derive(Clone, Hash)] -pub struct InnerNode { - /// Node metadata. - data: NodeData, - /// The number of nodes in the whole subtree, including this node. - descendants: usize, - /// Whether this node or any of its children are erroneous. - erroneous: bool, - /// The upper bound of this node's numbering range. - upper: u64, - /// This node's children, losslessly make up this node. - children: Vec<SyntaxNode>, -} - -impl InnerNode { - /// Creates a new node with the given kind and a single child. - pub fn with_child(kind: NodeKind, child: impl Into<SyntaxNode>) -> Self { - Self::with_children(kind, vec![child.into()]) - } - - /// Creates a new node with the given kind and children. 
- pub fn with_children(kind: NodeKind, children: Vec<SyntaxNode>) -> Self { - let mut len = 0; - let mut descendants = 1; - let mut erroneous = kind.is_error(); - - for child in &children { - len += child.len(); - descendants += child.descendants(); - erroneous |= child.erroneous(); - } - - Self { - data: NodeData::new(kind, len), - descendants, - erroneous, - upper: 0, - children, - } - } - - /// The node's metadata. - pub fn data(&self) -> &NodeData { - &self.data - } - - /// The node's type. - pub fn kind(&self) -> &NodeKind { - self.data().kind() - } - - /// The node's length. - pub fn len(&self) -> usize { - self.data().len() - } - - /// The node's span. - pub fn span(&self) -> Span { - self.data().span() - } - - /// The number of descendants, including the node itself. - pub fn descendants(&self) -> usize { - self.descendants - } - - /// The node's children. - pub fn children(&self) -> std::slice::Iter<'_, SyntaxNode> { - self.children.iter() - } - - /// Set a synthetic span for the node and all its descendants. - pub fn synthesize(&mut self, span: Span) { - self.data.synthesize(span); - for child in &mut self.children { - child.synthesize(span); - } - } - - /// Assign span numbers `within` an interval to this node's subtree or just - /// a `range` of its children. - pub fn numberize( - &mut self, - id: SourceId, - range: Option<Range<usize>>, - within: Range<u64>, - ) -> NumberingResult { - // Determine how many nodes we will number. - let descendants = match &range { - Some(range) if range.is_empty() => return Ok(()), - Some(range) => self.children[range.clone()] - .iter() - .map(SyntaxNode::descendants) - .sum::<usize>(), - None => self.descendants, - }; - - // Determine the distance between two neighbouring assigned numbers. If - // possible, we try to fit all numbers into the left half of `within` - // so that there is space for future insertions. 
- let space = within.end - within.start; - let mut stride = space / (2 * descendants as u64); - if stride == 0 { - stride = space / self.descendants as u64; - if stride == 0 { - return Err(Unnumberable); - } - } - - // Number this node itself. - let mut start = within.start; - if range.is_none() { - let end = start + stride; - self.data.numberize(id, start .. end)?; - self.upper = within.end; - start = end; - } - - // Number the children. - let len = self.children.len(); - for child in &mut self.children[range.unwrap_or(0 .. len)] { - let end = start + child.descendants() as u64 * stride; - child.numberize(id, start .. end)?; - start = end; - } - - Ok(()) - } - - /// The upper bound of assigned numbers in this subtree. - pub fn upper(&self) -> u64 { - self.upper - } - - /// If the span points into this node, convert it to a byte range. - pub fn range(&self, span: Span, mut offset: usize) -> Option<Range<usize>> { - // Check whether we found it. - if let Some(range) = self.data.range(span, offset) { - return Some(range); - } - - // The parent of a subtree has a smaller span number than all of its - // descendants. Therefore, we can bail out early if the target span's - // number is smaller than our number. - if span.number() < self.span().number() { - return None; - } - - let mut children = self.children.iter().peekable(); - while let Some(child) = children.next() { - // Every node in this child's subtree has a smaller span number than - // the next sibling. Therefore we only need to recurse if the next - // sibling's span number is larger than the target span's number. - if children - .peek() - .map_or(true, |next| next.span().number() > span.number()) - { - if let Some(range) = child.range(span, offset) { - return Some(range); - } - } - - offset += child.len(); - } - - None - } - - /// The node's children, mutably. - pub(crate) fn children_mut(&mut self) -> &mut [SyntaxNode] { - &mut self.children - } - - /// Replaces a range of children with a replacement. 
- /// - /// May have mutated the children if it returns `Err(_)`. - pub(crate) fn replace_children( - &mut self, - mut range: Range<usize>, - replacement: Vec<SyntaxNode>, - ) -> NumberingResult { - let superseded = &self.children[range.clone()]; - - // Compute the new byte length. - self.data.len = self.data.len - + replacement.iter().map(SyntaxNode::len).sum::<usize>() - - superseded.iter().map(SyntaxNode::len).sum::<usize>(); - - // Compute the new number of descendants. - self.descendants = self.descendants - + replacement.iter().map(SyntaxNode::descendants).sum::<usize>() - - superseded.iter().map(SyntaxNode::descendants).sum::<usize>(); - - // Determine whether we're still erroneous after the replacement. That's - // the case if - // - any of the new nodes is erroneous, - // - or if we were erroneous before due to a non-superseded node. - self.erroneous = replacement.iter().any(SyntaxNode::erroneous) - || (self.erroneous - && (self.children[.. range.start].iter().any(SyntaxNode::erroneous)) - || self.children[range.end ..].iter().any(SyntaxNode::erroneous)); - - // Perform the replacement. - let replacement_count = replacement.len(); - self.children.splice(range.clone(), replacement); - range.end = range.start + replacement_count; - - // Renumber the new children. Retries until it works, taking - // exponentially more children into account. - let mut left = 0; - let mut right = 0; - let max_left = range.start; - let max_right = self.children.len() - range.end; - loop { - let renumber = range.start - left .. range.end + right; - - // The minimum assignable number is either - // - the upper bound of the node right before the to-be-renumbered - // children, - // - or this inner node's span number plus one if renumbering starts - // at the first child. 
- let start_number = renumber - .start - .checked_sub(1) - .and_then(|i| self.children.get(i)) - .map_or(self.span().number() + 1, |child| child.upper()); - - // The upper bound for renumbering is either - // - the span number of the first child after the to-be-renumbered - // children, - // - or this node's upper bound if renumbering ends behind the last - // child. - let end_number = self - .children - .get(renumber.end) - .map_or(self.upper(), |next| next.span().number()); - - // Try to renumber. - let within = start_number .. end_number; - let id = self.span().source(); - if self.numberize(id, Some(renumber), within).is_ok() { - return Ok(()); - } - - // If it didn't even work with all children, we give up. - if left == max_left && right == max_right { - return Err(Unnumberable); - } - - // Exponential expansion to both sides. - left = (left + 1).next_power_of_two().min(max_left); - right = (right + 1).next_power_of_two().min(max_right); - } - } - - /// Update this node after changes were made to one of its children. 
- pub(crate) fn update_parent( - &mut self, - prev_len: usize, - new_len: usize, - prev_descendants: usize, - new_descendants: usize, - ) { - self.data.len = self.data.len + new_len - prev_len; - self.descendants = self.descendants + new_descendants - prev_descendants; - self.erroneous = self.children.iter().any(SyntaxNode::erroneous); - } -} - -impl From<InnerNode> for SyntaxNode { - fn from(node: InnerNode) -> Self { - Arc::new(node).into() - } -} - -impl From<Arc<InnerNode>> for SyntaxNode { - fn from(node: Arc<InnerNode>) -> Self { - Self::Inner(node) - } -} - -impl Debug for InnerNode { - fn fmt(&self, f: &mut Formatter) -> fmt::Result { - self.data.fmt(f)?; - if !self.children.is_empty() { - f.write_str(" ")?; - f.debug_list().entries(&self.children).finish()?; + if found != expected { + println!("source: {text:?}"); + println!("expected: {expected:#?}"); + println!("found: {found:#?}"); + panic!("test failed"); } - Ok(()) - } -} - -impl PartialEq for InnerNode { - fn eq(&self, other: &Self) -> bool { - self.data == other.data - && self.descendants == other.descendants - && self.erroneous == other.erroneous - && self.children == other.children - } -} - -/// Data shared between inner and leaf nodes. -#[derive(Clone, Hash)] -pub struct NodeData { - /// What kind of node this is (each kind would have its own struct in a - /// strongly typed AST). - kind: NodeKind, - /// The byte length of the node in the source. - len: usize, - /// The node's span. - span: Span, -} - -impl NodeData { - /// Create new node metadata. - pub fn new(kind: NodeKind, len: usize) -> Self { - Self { len, kind, span: Span::detached() } - } - - /// The node's type. - pub fn kind(&self) -> &NodeKind { - &self.kind - } - - /// The node's length. - pub fn len(&self) -> usize { - self.len - } - - /// The node's span. - pub fn span(&self) -> Span { - self.span - } - - /// Set a synthetic span for the node. 
- pub fn synthesize(&mut self, span: Span) { - self.span = span; - } - - /// Assign a span to the node. - pub fn numberize(&mut self, id: SourceId, within: Range<u64>) -> NumberingResult { - if within.start < within.end { - self.span = Span::new(id, (within.start + within.end) / 2); - Ok(()) - } else { - Err(Unnumberable) - } - } - - /// If the span points into this node, convert it to a byte range. - pub fn range(&self, span: Span, offset: usize) -> Option<Range<usize>> { - (self.span == span).then(|| offset .. offset + self.len()) - } -} - -impl From<NodeData> for SyntaxNode { - fn from(token: NodeData) -> Self { - Self::Leaf(token) - } -} - -impl Debug for NodeData { - fn fmt(&self, f: &mut Formatter) -> fmt::Result { - write!(f, "{:?}: {}", self.kind, self.len) - } -} - -impl PartialEq for NodeData { - fn eq(&self, other: &Self) -> bool { - self.kind == other.kind && self.len == other.len } } diff --git a/src/syntax/node.rs b/src/syntax/node.rs new file mode 100644 index 00000000..6a7d424a --- /dev/null +++ b/src/syntax/node.rs @@ -0,0 +1,548 @@ +use std::fmt::{self, Debug, Formatter}; +use std::ops::Range; +use std::sync::Arc; + +use super::ast::TypedNode; +use super::{NodeKind, NumberingResult, SourceId, Span, Unnumberable}; +use crate::diag::SourceError; + +/// An inner or leaf node in the untyped syntax tree. +#[derive(Clone, PartialEq, Hash)] +pub enum SyntaxNode { + /// A reference-counted inner node. + Inner(Arc<InnerNode>), + /// A leaf token. + Leaf(NodeData), +} + +impl SyntaxNode { + /// The metadata of the node. + pub fn data(&self) -> &NodeData { + match self { + Self::Inner(inner) => &inner.data, + Self::Leaf(leaf) => leaf, + } + } + + /// The type of the node. + pub fn kind(&self) -> &NodeKind { + self.data().kind() + } + + /// The length of the node. + pub fn len(&self) -> usize { + self.data().len() + } + + /// The number of descendants, including the node itself. 
+ pub fn descendants(&self) -> usize { + match self { + Self::Inner(inner) => inner.descendants(), + Self::Leaf(_) => 1, + } + } + + /// The span of the node. + pub fn span(&self) -> Span { + self.data().span() + } + + /// Whether the node or its children contain an error. + pub fn erroneous(&self) -> bool { + match self { + Self::Inner(node) => node.erroneous, + Self::Leaf(data) => data.kind.is_error(), + } + } + + /// The error messages for this node and its descendants. + pub fn errors(&self) -> Vec<SourceError> { + if !self.erroneous() { + return vec![]; + } + + match self.kind() { + NodeKind::Error(pos, message) => { + vec![SourceError::new(self.span(), message.clone()).with_pos(*pos)] + } + _ => self + .children() + .filter(|node| node.erroneous()) + .flat_map(|node| node.errors()) + .collect(), + } + } + + /// The node's children. + pub fn children(&self) -> std::slice::Iter<'_, SyntaxNode> { + match self { + Self::Inner(inner) => inner.children(), + Self::Leaf(_) => [].iter(), + } + } + + /// Convert the node to a typed AST node. + pub fn cast<T>(&self) -> Option<T> + where + T: TypedNode, + { + T::from_untyped(self) + } + + /// Get the first child that can cast to the AST type `T`. + pub fn cast_first_child<T: TypedNode>(&self) -> Option<T> { + self.children().find_map(Self::cast) + } + + /// Get the last child that can cast to the AST type `T`. + pub fn cast_last_child<T: TypedNode>(&self) -> Option<T> { + self.children().rev().find_map(Self::cast) + } + + /// Change the type of the node. + pub fn convert(&mut self, kind: NodeKind) { + match self { + Self::Inner(inner) => { + let node = Arc::make_mut(inner); + node.erroneous |= kind.is_error(); + node.data.kind = kind; + } + Self::Leaf(leaf) => leaf.kind = kind, + } + } + + /// Set a synthetic span for the node and all its descendants. 
+ pub fn synthesize(&mut self, span: Span) { + match self { + Self::Inner(inner) => Arc::make_mut(inner).synthesize(span), + Self::Leaf(leaf) => leaf.synthesize(span), + } + } + + /// Assign spans to each node. + pub fn numberize(&mut self, id: SourceId, within: Range<u64>) -> NumberingResult { + match self { + Self::Inner(inner) => Arc::make_mut(inner).numberize(id, None, within), + Self::Leaf(leaf) => leaf.numberize(id, within), + } + } + + /// The upper bound of assigned numbers in this subtree. + pub fn upper(&self) -> u64 { + match self { + Self::Inner(inner) => inner.upper(), + Self::Leaf(leaf) => leaf.span().number() + 1, + } + } + + /// If the span points into this node, convert it to a byte range. + pub fn range(&self, span: Span, offset: usize) -> Option<Range<usize>> { + match self { + Self::Inner(inner) => inner.range(span, offset), + Self::Leaf(leaf) => leaf.range(span, offset), + } + } + + /// Returns all leaf descendants of this node (may include itself). + /// + /// This method is slow and only intended for testing. + pub fn leafs(&self) -> Vec<Self> { + if match self { + Self::Inner(inner) => inner.children.is_empty(), + Self::Leaf(_) => true, + } { + vec![self.clone()] + } else { + self.children().flat_map(Self::leafs).collect() + } + } +} + +impl Default for SyntaxNode { + fn default() -> Self { + Self::Leaf(NodeData::new(NodeKind::None, 0)) + } +} + +impl Debug for SyntaxNode { + fn fmt(&self, f: &mut Formatter) -> fmt::Result { + match self { + Self::Inner(node) => node.fmt(f), + Self::Leaf(token) => token.fmt(f), + } + } +} + +/// An inner node in the untyped syntax tree. +#[derive(Clone, Hash)] +pub struct InnerNode { + /// Node metadata. + data: NodeData, + /// The number of nodes in the whole subtree, including this node. + descendants: usize, + /// Whether this node or any of its children are erroneous. + erroneous: bool, + /// The upper bound of this node's numbering range. 
+    upper: u64,
+    /// This node's children, which losslessly make up this node.
+    children: Vec<SyntaxNode>,
+}
+
+impl InnerNode {
+    /// Creates a new node with the given kind and a single child.
+    pub fn with_child(kind: NodeKind, child: impl Into<SyntaxNode>) -> Self {
+        Self::with_children(kind, vec![child.into()])
+    }
+
+    /// Creates a new node with the given kind and children.
+    pub fn with_children(kind: NodeKind, children: Vec<SyntaxNode>) -> Self {
+        let mut len = 0;
+        let mut descendants = 1;
+        let mut erroneous = kind.is_error();
+
+        for child in &children {
+            len += child.len();
+            descendants += child.descendants();
+            erroneous |= child.erroneous();
+        }
+
+        Self {
+            data: NodeData::new(kind, len),
+            descendants,
+            erroneous,
+            upper: 0,
+            children,
+        }
+    }
+
+    /// The node's metadata.
+    pub fn data(&self) -> &NodeData {
+        &self.data
+    }
+
+    /// The node's type.
+    pub fn kind(&self) -> &NodeKind {
+        self.data().kind()
+    }
+
+    /// The node's length.
+    pub fn len(&self) -> usize {
+        self.data().len()
+    }
+
+    /// The node's span.
+    pub fn span(&self) -> Span {
+        self.data().span()
+    }
+
+    /// The number of descendants, including the node itself.
+    pub fn descendants(&self) -> usize {
+        self.descendants
+    }
+
+    /// The node's children.
+    pub fn children(&self) -> std::slice::Iter<'_, SyntaxNode> {
+        self.children.iter()
+    }
+
+    /// Set a synthetic span for the node and all its descendants.
+    pub fn synthesize(&mut self, span: Span) {
+        self.data.synthesize(span);
+        for child in &mut self.children {
+            child.synthesize(span);
+        }
+    }
+
+    /// Assign span numbers `within` an interval to this node's subtree or just
+    /// a `range` of its children.
+    pub fn numberize(
+        &mut self,
+        id: SourceId,
+        range: Option<Range<usize>>,
+        within: Range<u64>,
+    ) -> NumberingResult {
+        // Determine how many nodes we will number. 
+ let descendants = match &range { + Some(range) if range.is_empty() => return Ok(()), + Some(range) => self.children[range.clone()] + .iter() + .map(SyntaxNode::descendants) + .sum::<usize>(), + None => self.descendants, + }; + + // Determine the distance between two neighbouring assigned numbers. If + // possible, we try to fit all numbers into the left half of `within` + // so that there is space for future insertions. + let space = within.end - within.start; + let mut stride = space / (2 * descendants as u64); + if stride == 0 { + stride = space / self.descendants as u64; + if stride == 0 { + return Err(Unnumberable); + } + } + + // Number this node itself. + let mut start = within.start; + if range.is_none() { + let end = start + stride; + self.data.numberize(id, start .. end)?; + self.upper = within.end; + start = end; + } + + // Number the children. + let len = self.children.len(); + for child in &mut self.children[range.unwrap_or(0 .. len)] { + let end = start + child.descendants() as u64 * stride; + child.numberize(id, start .. end)?; + start = end; + } + + Ok(()) + } + + /// The upper bound of assigned numbers in this subtree. + pub fn upper(&self) -> u64 { + self.upper + } + + /// If the span points into this node, convert it to a byte range. + pub fn range(&self, span: Span, mut offset: usize) -> Option<Range<usize>> { + // Check whether we found it. + if let Some(range) = self.data.range(span, offset) { + return Some(range); + } + + // The parent of a subtree has a smaller span number than all of its + // descendants. Therefore, we can bail out early if the target span's + // number is smaller than our number. + if span.number() < self.span().number() { + return None; + } + + let mut children = self.children.iter().peekable(); + while let Some(child) = children.next() { + // Every node in this child's subtree has a smaller span number than + // the next sibling. 
Therefore we only need to recurse if the next + // sibling's span number is larger than the target span's number. + if children + .peek() + .map_or(true, |next| next.span().number() > span.number()) + { + if let Some(range) = child.range(span, offset) { + return Some(range); + } + } + + offset += child.len(); + } + + None + } + + /// The node's children, mutably. + pub(crate) fn children_mut(&mut self) -> &mut [SyntaxNode] { + &mut self.children + } + + /// Replaces a range of children with a replacement. + /// + /// May have mutated the children if it returns `Err(_)`. + pub(crate) fn replace_children( + &mut self, + mut range: Range<usize>, + replacement: Vec<SyntaxNode>, + ) -> NumberingResult { + let superseded = &self.children[range.clone()]; + + // Compute the new byte length. + self.data.len = self.data.len + + replacement.iter().map(SyntaxNode::len).sum::<usize>() + - superseded.iter().map(SyntaxNode::len).sum::<usize>(); + + // Compute the new number of descendants. + self.descendants = self.descendants + + replacement.iter().map(SyntaxNode::descendants).sum::<usize>() + - superseded.iter().map(SyntaxNode::descendants).sum::<usize>(); + + // Determine whether we're still erroneous after the replacement. That's + // the case if + // - any of the new nodes is erroneous, + // - or if we were erroneous before due to a non-superseded node. + self.erroneous = replacement.iter().any(SyntaxNode::erroneous) + || (self.erroneous + && (self.children[.. range.start].iter().any(SyntaxNode::erroneous)) + || self.children[range.end ..].iter().any(SyntaxNode::erroneous)); + + // Perform the replacement. + let replacement_count = replacement.len(); + self.children.splice(range.clone(), replacement); + range.end = range.start + replacement_count; + + // Renumber the new children. Retries until it works, taking + // exponentially more children into account. 
+ let mut left = 0; + let mut right = 0; + let max_left = range.start; + let max_right = self.children.len() - range.end; + loop { + let renumber = range.start - left .. range.end + right; + + // The minimum assignable number is either + // - the upper bound of the node right before the to-be-renumbered + // children, + // - or this inner node's span number plus one if renumbering starts + // at the first child. + let start_number = renumber + .start + .checked_sub(1) + .and_then(|i| self.children.get(i)) + .map_or(self.span().number() + 1, |child| child.upper()); + + // The upper bound for renumbering is either + // - the span number of the first child after the to-be-renumbered + // children, + // - or this node's upper bound if renumbering ends behind the last + // child. + let end_number = self + .children + .get(renumber.end) + .map_or(self.upper(), |next| next.span().number()); + + // Try to renumber. + let within = start_number .. end_number; + let id = self.span().source(); + if self.numberize(id, Some(renumber), within).is_ok() { + return Ok(()); + } + + // If it didn't even work with all children, we give up. + if left == max_left && right == max_right { + return Err(Unnumberable); + } + + // Exponential expansion to both sides. + left = (left + 1).next_power_of_two().min(max_left); + right = (right + 1).next_power_of_two().min(max_right); + } + } + + /// Update this node after changes were made to one of its children. 
+ pub(crate) fn update_parent( + &mut self, + prev_len: usize, + new_len: usize, + prev_descendants: usize, + new_descendants: usize, + ) { + self.data.len = self.data.len + new_len - prev_len; + self.descendants = self.descendants + new_descendants - prev_descendants; + self.erroneous = self.children.iter().any(SyntaxNode::erroneous); + } +} + +impl From<InnerNode> for SyntaxNode { + fn from(node: InnerNode) -> Self { + Arc::new(node).into() + } +} + +impl From<Arc<InnerNode>> for SyntaxNode { + fn from(node: Arc<InnerNode>) -> Self { + Self::Inner(node) + } +} + +impl Debug for InnerNode { + fn fmt(&self, f: &mut Formatter) -> fmt::Result { + self.data.fmt(f)?; + if !self.children.is_empty() { + f.write_str(" ")?; + f.debug_list().entries(&self.children).finish()?; + } + Ok(()) + } +} + +impl PartialEq for InnerNode { + fn eq(&self, other: &Self) -> bool { + self.data == other.data + && self.descendants == other.descendants + && self.erroneous == other.erroneous + && self.children == other.children + } +} + +/// Data shared between inner and leaf nodes. +#[derive(Clone, Hash)] +pub struct NodeData { + /// What kind of node this is (each kind would have its own struct in a + /// strongly typed AST). + pub(super) kind: NodeKind, + /// The byte length of the node in the source. + len: usize, + /// The node's span. + span: Span, +} + +impl NodeData { + /// Create new node metadata. + pub fn new(kind: NodeKind, len: usize) -> Self { + Self { len, kind, span: Span::detached() } + } + + /// The node's type. + pub fn kind(&self) -> &NodeKind { + &self.kind + } + + /// The node's length. + pub fn len(&self) -> usize { + self.len + } + + /// The node's span. + pub fn span(&self) -> Span { + self.span + } + + /// Set a synthetic span for the node. + pub fn synthesize(&mut self, span: Span) { + self.span = span; + } + + /// Assign a span to the node. 
+ pub fn numberize(&mut self, id: SourceId, within: Range<u64>) -> NumberingResult { + if within.start < within.end { + self.span = Span::new(id, (within.start + within.end) / 2); + Ok(()) + } else { + Err(Unnumberable) + } + } + + /// If the span points into this node, convert it to a byte range. + pub fn range(&self, span: Span, offset: usize) -> Option<Range<usize>> { + (self.span == span).then(|| offset .. offset + self.len()) + } +} + +impl From<NodeData> for SyntaxNode { + fn from(token: NodeData) -> Self { + Self::Leaf(token) + } +} + +impl Debug for NodeData { + fn fmt(&self, f: &mut Formatter) -> fmt::Result { + write!(f, "{:?}: {}", self.kind, self.len) + } +} + +impl PartialEq for NodeData { + fn eq(&self, other: &Self) -> bool { + self.kind == other.kind && self.len == other.len + } +} diff --git a/src/syntax/parser.rs b/src/syntax/parser.rs new file mode 100644 index 00000000..83b333f4 --- /dev/null +++ b/src/syntax/parser.rs @@ -0,0 +1,558 @@ +use std::fmt::{self, Display, Formatter}; +use std::mem; +use std::ops::Range; + +use super::{ErrorPos, InnerNode, NodeData, NodeKind, SyntaxNode, TokenMode, Tokens}; +use crate::util::EcoString; + +/// A convenient token-based parser. +pub struct Parser<'s> { + /// An iterator over the source tokens. + tokens: Tokens<'s>, + /// Whether we are at the end of the file or of a group. + eof: bool, + /// The current token. + current: Option<NodeKind>, + /// The end byte index of the last non-trivia token. + prev_end: usize, + /// The start byte index of the peeked token. + current_start: usize, + /// The stack of open groups. + groups: Vec<GroupEntry>, + /// The children of the currently built node. + children: Vec<SyntaxNode>, + /// Whether the last group was not correctly terminated. + unterminated_group: bool, + /// Whether a group terminator was found that did not close a group. + stray_terminator: bool, +} + +impl<'s> Parser<'s> { + /// Create a new parser for the source string. 
+ pub fn new(text: &'s str, mode: TokenMode) -> Self { + Self::with_prefix("", text, mode) + } + + /// Create a new parser for the source string that is prefixed by some text + /// that does not need to be parsed but taken into account for column + /// calculation. + pub fn with_prefix(prefix: &str, text: &'s str, mode: TokenMode) -> Self { + let mut tokens = Tokens::with_prefix(prefix, text, mode); + let current = tokens.next(); + Self { + tokens, + eof: current.is_none(), + current, + prev_end: 0, + current_start: 0, + groups: vec![], + children: vec![], + unterminated_group: false, + stray_terminator: false, + } + } + + /// End the parsing process and return the parsed children. + pub fn finish(self) -> Vec<SyntaxNode> { + self.children + } + + /// End the parsing process and return + /// - the parsed children and whether the last token was terminated, if all + /// groups were terminated correctly, or + /// - `None` otherwise. + pub fn consume(self) -> Option<(Vec<SyntaxNode>, bool)> { + self.terminated().then(|| (self.children, self.tokens.terminated())) + } + + /// Create a new marker. + pub fn marker(&mut self) -> Marker { + Marker(self.children.len()) + } + + /// Create a marker right before the trailing trivia. + pub fn trivia_start(&self) -> Marker { + let count = self + .children + .iter() + .rev() + .take_while(|node| self.is_trivia(node.kind())) + .count(); + Marker(self.children.len() - count) + } + + /// Perform a subparse that wraps its result in a node with the given kind. + pub fn perform<F, T>(&mut self, kind: NodeKind, f: F) -> T + where + F: FnOnce(&mut Self) -> T, + { + let prev = mem::take(&mut self.children); + let output = f(self); + let until = self.trivia_start(); + let mut children = mem::replace(&mut self.children, prev); + + if self.tokens.mode() == TokenMode::Markup { + self.children.push(InnerNode::with_children(kind, children).into()); + } else { + // Trailing trivia should not be wrapped into the new node. 
+ let idx = self.children.len(); + self.children.push(SyntaxNode::default()); + self.children.extend(children.drain(until.0 ..)); + self.children[idx] = InnerNode::with_children(kind, children).into(); + } + + output + } + + /// Whether the end of the source string or group is reached. + pub fn eof(&self) -> bool { + self.eof + } + + /// Consume the current token and also trailing trivia. + pub fn eat(&mut self) { + self.stray_terminator |= match self.current { + Some(NodeKind::RightParen) => !self.inside(Group::Paren), + Some(NodeKind::RightBracket) => !self.inside(Group::Bracket), + Some(NodeKind::RightBrace) => !self.inside(Group::Brace), + _ => false, + }; + + self.prev_end = self.tokens.cursor(); + self.bump(); + + if self.tokens.mode() != TokenMode::Markup { + // Skip whitespace and comments. + while self.current.as_ref().map_or(false, |x| self.is_trivia(x)) { + self.bump(); + } + } + + self.repeek(); + } + + /// Consume the current token if it is the given one. + pub fn eat_if(&mut self, kind: NodeKind) -> bool { + let at = self.at(kind); + if at { + self.eat(); + } + at + } + + /// Eat tokens while the condition is true. + pub fn eat_while<F>(&mut self, mut f: F) + where + F: FnMut(&NodeKind) -> bool, + { + while self.peek().map_or(false, |t| f(t)) { + self.eat(); + } + } + + /// Consume the current token if it is the given one and produce an error if + /// not. + pub fn expect(&mut self, kind: NodeKind) -> ParseResult { + let at = self.peek() == Some(&kind); + if at { + self.eat(); + Ok(()) + } else { + self.expected(kind.name()); + Err(ParseError) + } + } + + /// Consume the current token, debug-asserting that it is the given one. + #[track_caller] + pub fn assert(&mut self, kind: NodeKind) { + debug_assert_eq!(self.peek(), Some(&kind)); + self.eat(); + } + + /// Whether the current token is of the given type. + pub fn at(&self, kind: NodeKind) -> bool { + self.peek() == Some(&kind) + } + + /// Peek at the current token without consuming it. 
+ pub fn peek(&self) -> Option<&NodeKind> { + if self.eof { None } else { self.current.as_ref() } + } + + /// Peek at the current token, but only if it follows immediately after the + /// last one without any trivia in between. + pub fn peek_direct(&self) -> Option<&NodeKind> { + if self.prev_end() == self.current_start() { + self.peek() + } else { + None + } + } + + /// Peek at the source of the current token. + pub fn peek_src(&self) -> &'s str { + self.get(self.current_start() .. self.current_end()) + } + + /// Obtain a range of the source code. + pub fn get(&self, range: Range<usize>) -> &'s str { + self.tokens.scanner().get(range) + } + + /// The byte index at which the last non-trivia token ended. + pub fn prev_end(&self) -> usize { + self.prev_end + } + + /// The byte index at which the current token starts. + pub fn current_start(&self) -> usize { + self.current_start + } + + /// The byte index at which the current token ends. + pub fn current_end(&self) -> usize { + self.tokens.cursor() + } + + /// Determine the column index for the given byte index. + pub fn column(&self, index: usize) -> usize { + self.tokens.column(index) + } + + /// Continue parsing in a group. + /// + /// When the end delimiter of the group is reached, all subsequent calls to + /// `peek()` return `None`. Parsing can only continue with a matching call + /// to `end_group`. + /// + /// This panics if the current token does not start the given group. 
+ #[track_caller] + pub fn start_group(&mut self, kind: Group) { + self.groups.push(GroupEntry { kind, prev_mode: self.tokens.mode() }); + self.tokens.set_mode(match kind { + Group::Strong | Group::Emph => TokenMode::Markup, + Group::Bracket => match self.tokens.mode() { + TokenMode::Math => TokenMode::Math, + _ => TokenMode::Markup, + }, + Group::Brace | Group::Paren => match self.tokens.mode() { + TokenMode::Math => TokenMode::Math, + _ => TokenMode::Code, + }, + Group::Math => TokenMode::Math, + Group::Expr | Group::Imports => TokenMode::Code, + }); + + match kind { + Group::Brace => self.assert(NodeKind::LeftBrace), + Group::Bracket => self.assert(NodeKind::LeftBracket), + Group::Paren => self.assert(NodeKind::LeftParen), + Group::Strong => self.assert(NodeKind::Star), + Group::Emph => self.assert(NodeKind::Underscore), + Group::Math => self.assert(NodeKind::Dollar), + Group::Expr => self.repeek(), + Group::Imports => self.repeek(), + } + } + + /// End the parsing of a group. + /// + /// This panics if no group was started. + #[track_caller] + pub fn end_group(&mut self) { + let group_mode = self.tokens.mode(); + let group = self.groups.pop().expect("no started group"); + self.tokens.set_mode(group.prev_mode); + + let mut rescan = self.tokens.mode() != group_mode; + + // Eat the end delimiter if there is one. + if let Some((end, required)) = match group.kind { + Group::Brace => Some((NodeKind::RightBrace, true)), + Group::Bracket => Some((NodeKind::RightBracket, true)), + Group::Paren => Some((NodeKind::RightParen, true)), + Group::Strong => Some((NodeKind::Star, true)), + Group::Emph => Some((NodeKind::Underscore, true)), + Group::Math => Some((NodeKind::Dollar, true)), + Group::Expr => Some((NodeKind::Semicolon, false)), + Group::Imports => None, + } { + if self.current.as_ref() == Some(&end) { + // If another group closes after a group with the missing + // terminator, its scope of influence ends here and no longer + // taints the rest of the reparse. 
+                self.unterminated_group = false;
+
+                // Bump the delimiter and return. No need to rescan in this
+                // case. Also, we know that the delimiter is not stray even
+                // though we already removed the group.
+                let s = self.stray_terminator;
+                self.eat();
+                self.stray_terminator = s;
+                rescan = false;
+            } else if required {
+                self.expected(end.name());
+                self.unterminated_group = true;
+            }
+        }
+
+        // Rescan the peeked token if the mode changed.
+        if rescan {
+            let mut target = self.prev_end();
+            if group_mode != TokenMode::Markup {
+                let start = self.trivia_start().0;
+                target = self.current_start
+                    - self.children[start ..].iter().map(SyntaxNode::len).sum::<usize>();
+                self.children.truncate(start);
+            }
+
+            self.tokens.jump(target);
+            self.prev_end = self.tokens.cursor();
+            self.current_start = self.tokens.cursor();
+            self.current = self.tokens.next();
+        }
+
+        self.repeek();
+    }
+
+    /// Checks if all groups were correctly terminated.
+    fn terminated(&self) -> bool {
+        self.groups.is_empty() && !self.unterminated_group && !self.stray_terminator
+    }
+
+    /// Low-level bump that consumes exactly one token without special trivia
+    /// handling.
+    fn bump(&mut self) {
+        let kind = self.current.take().unwrap();
+        let len = self.tokens.cursor() - self.current_start;
+        self.children.push(NodeData::new(kind, len).into());
+        self.current_start = self.tokens.cursor();
+        self.current = self.tokens.next();
+    }
+
+    /// Take another look at the current token to recheck whether it ends a
+    /// group. 
+ fn repeek(&mut self) { + self.eof = match &self.current { + Some(NodeKind::RightBrace) => self.inside(Group::Brace), + Some(NodeKind::RightBracket) => self.inside(Group::Bracket), + Some(NodeKind::RightParen) => self.inside(Group::Paren), + Some(NodeKind::Star) => self.inside(Group::Strong), + Some(NodeKind::Underscore) => self.inside(Group::Emph), + Some(NodeKind::Dollar) => self.inside(Group::Math), + Some(NodeKind::Semicolon) => self.inside(Group::Expr), + Some(NodeKind::From) => self.inside(Group::Imports), + Some(NodeKind::Space { newlines }) => self.space_ends_group(*newlines), + Some(_) => false, + None => true, + }; + } + + /// Returns whether the given type can be skipped over. + fn is_trivia(&self, token: &NodeKind) -> bool { + match token { + NodeKind::Space { newlines } => !self.space_ends_group(*newlines), + NodeKind::LineComment => true, + NodeKind::BlockComment => true, + _ => false, + } + } + + /// Whether a space with the given number of newlines ends the current group. + fn space_ends_group(&self, n: usize) -> bool { + if n == 0 { + return false; + } + + match self.groups.last().map(|group| group.kind) { + Some(Group::Strong | Group::Emph) => n >= 2, + Some(Group::Imports) => n >= 1, + Some(Group::Expr) if n >= 1 => { + // Allow else and method call to continue on next line. + self.groups.iter().nth_back(1).map(|group| group.kind) + != Some(Group::Brace) + || !matches!( + self.tokens.clone().next(), + Some(NodeKind::Else | NodeKind::Dot) + ) + } + _ => false, + } + } + + /// Whether we are inside the given group (can be nested). + fn inside(&self, kind: Group) -> bool { + self.groups + .iter() + .rev() + .take_while(|g| !kind.is_weak() || g.kind.is_weak()) + .any(|g| g.kind == kind) + } +} + +/// Error handling. +impl Parser<'_> { + /// Eat the current token and add an error that it is unexpected. 
+ pub fn unexpected(&mut self) { + if let Some(found) = self.peek() { + let msg = format_eco!("unexpected {}", found.name()); + let error = NodeKind::Error(ErrorPos::Full, msg); + self.perform(error, Self::eat); + } + } + + /// Add an error that the `thing` was expected at the end of the last + /// non-trivia token. + pub fn expected(&mut self, thing: &str) { + self.expected_at(self.trivia_start(), thing); + } + + /// Insert an error message that `what` was expected at the marker position. + pub fn expected_at(&mut self, marker: Marker, what: &str) { + let msg = format_eco!("expected {}", what); + let error = NodeKind::Error(ErrorPos::Full, msg); + self.children.insert(marker.0, NodeData::new(error, 0).into()); + } + + /// Eat the current token and add an error that it is not the expected + /// `thing`. + pub fn expected_found(&mut self, thing: &str) { + match self.peek() { + Some(found) => { + let msg = format_eco!("expected {}, found {}", thing, found.name()); + let error = NodeKind::Error(ErrorPos::Full, msg); + self.perform(error, Self::eat); + } + None => self.expected(thing), + } + } +} + +/// Marks a location in a parser's child list. +#[derive(Debug, Copy, Clone, Eq, PartialEq)] +pub struct Marker(usize); + +impl Marker { + /// Peek at the child directly before the marker. + pub fn before<'a>(self, p: &'a Parser) -> Option<&'a SyntaxNode> { + p.children.get(self.0.checked_sub(1)?) + } + + /// Peek at the child directly after the marker. + pub fn after<'a>(self, p: &'a Parser) -> Option<&'a SyntaxNode> { + p.children.get(self.0) + } + + /// Convert the child directly after marker. + pub fn convert(self, p: &mut Parser, kind: NodeKind) { + if let Some(child) = p.children.get_mut(self.0) { + child.convert(kind); + } + } + + /// Perform a subparse that wraps all children after the marker in a node + /// with the given kind. 
+ pub fn perform<T, F>(self, p: &mut Parser, kind: NodeKind, f: F) -> T + where + F: FnOnce(&mut Parser) -> T, + { + let success = f(p); + self.end(p, kind); + success + } + + /// Wrap all children after the marker (excluding trailing trivia) in a node + /// with the given `kind`. + pub fn end(self, p: &mut Parser, kind: NodeKind) { + let until = p.trivia_start().0.max(self.0); + let children = p.children.drain(self.0 .. until).collect(); + p.children + .insert(self.0, InnerNode::with_children(kind, children).into()); + } + + /// Wrap all children that do not fulfill the predicate in error nodes. + pub fn filter_children<F>(self, p: &mut Parser, mut f: F) + where + F: FnMut(&SyntaxNode) -> Result<(), &'static str>, + { + for child in &mut p.children[self.0 ..] { + // Don't expose errors. + if child.kind().is_error() { + continue; + } + + // Don't expose trivia in code. + if p.tokens.mode() != TokenMode::Markup && child.kind().is_trivia() { + continue; + } + + if let Err(msg) = f(child) { + let mut msg = EcoString::from(msg); + if msg.starts_with("expected") { + msg.push_str(", found "); + msg.push_str(child.kind().name()); + } + let error = NodeKind::Error(ErrorPos::Full, msg); + let inner = mem::take(child); + *child = InnerNode::with_child(error, inner).into(); + } + } + } +} + +/// A logical group of tokens, e.g. `[...]`. +#[derive(Debug)] +struct GroupEntry { + /// The kind of group this is. This decides which token(s) will end the + /// group. For example, a [`Group::Paren`] will be ended by + /// [`Token::RightParen`]. + pub kind: Group, + /// The mode the parser was in _before_ the group started (to which we go + /// back once the group ends). + pub prev_mode: TokenMode, +} + +/// A group, confined by optional start and end delimiters. +#[derive(Debug, Copy, Clone, Eq, PartialEq)] +pub enum Group { + /// A curly-braced group: `{...}`. + Brace, + /// A bracketed group: `[...]`. + Bracket, + /// A parenthesized group: `(...)`. 
+ Paren, + /// A group surrounded with stars: `*...*`. + Strong, + /// A group surrounded with underscore: `_..._`. + Emph, + /// A group surrounded by dollar signs: `$...$`. + Math, + /// A group ended by a semicolon or a line break: `;`, `\n`. + Expr, + /// A group for import items, ended by a semicolon, line break or `from`. + Imports, +} + +impl Group { + /// Whether the group can only force other weak groups to end. + fn is_weak(self) -> bool { + matches!(self, Group::Strong | Group::Emph) + } +} + +/// Allows parser methods to use the try operator. Never returned top-level +/// because the parser recovers from all errors. +pub type ParseResult<T = ()> = Result<T, ParseError>; + +/// The error type for parsing. +#[derive(Debug, Copy, Clone, Eq, PartialEq)] +pub struct ParseError; + +impl Display for ParseError { + fn fmt(&self, f: &mut Formatter) -> fmt::Result { + f.pad("failed to parse") + } +} + +impl std::error::Error for ParseError {} diff --git a/src/syntax/parsing.rs b/src/syntax/parsing.rs new file mode 100644 index 00000000..10b4c4c2 --- /dev/null +++ b/src/syntax/parsing.rs @@ -0,0 +1,1140 @@ +use std::collections::HashSet; + +use super::ast::{Assoc, BinOp, UnOp}; +use super::{ + ErrorPos, Group, Marker, NodeKind, ParseError, ParseResult, Parser, SyntaxNode, + TokenMode, +}; +use crate::util::EcoString; + +/// Parse a source file. +pub fn parse(text: &str) -> SyntaxNode { + let mut p = Parser::new(text, TokenMode::Markup); + markup(&mut p, true); + p.finish().into_iter().next().unwrap() +} + +/// Parse code directly, only used for syntax highlighting. +pub fn parse_code(text: &str) -> SyntaxNode { + let mut p = Parser::new(text, TokenMode::Code); + p.perform(NodeKind::CodeBlock, code); + p.finish().into_iter().next().unwrap() +} + +/// Reparse a code block. +/// +/// Returns `Some` if all of the input was consumed. 
pub(crate) fn reparse_code_block(
    prefix: &str,
    text: &str,
    end_pos: usize,
) -> Option<(Vec<SyntaxNode>, bool, usize)> {
    let mut p = Parser::with_prefix(prefix, text, TokenMode::Code);
    if !p.at(NodeKind::LeftBrace) {
        return None;
    }

    code_block(&mut p);

    let (mut node, terminated) = p.consume()?;
    let first = node.remove(0);
    // The reparsed block must cover exactly the old range.
    if first.len() != end_pos {
        return None;
    }

    Some((vec![first], terminated, 1))
}

/// Reparse a content block.
///
/// Returns `Some` if all of the input was consumed.
pub(crate) fn reparse_content_block(
    prefix: &str,
    text: &str,
    end_pos: usize,
) -> Option<(Vec<SyntaxNode>, bool, usize)> {
    let mut p = Parser::with_prefix(prefix, text, TokenMode::Code);
    if !p.at(NodeKind::LeftBracket) {
        return None;
    }

    content_block(&mut p);

    let (mut node, terminated) = p.consume()?;
    let first = node.remove(0);
    // The reparsed block must cover exactly the old range.
    if first.len() != end_pos {
        return None;
    }

    Some((vec![first], terminated, 1))
}

/// Reparse a sequence of markup elements without the topmost node.
///
/// Returns `Some` if all of the input was consumed.
pub(crate) fn reparse_markup_elements(
    prefix: &str,
    text: &str,
    end_pos: usize,
    differential: isize,
    reference: &[SyntaxNode],
    mut at_start: bool,
    min_indent: usize,
) -> Option<(Vec<SyntaxNode>, bool, usize)> {
    let mut p = Parser::with_prefix(prefix, text, TokenMode::Markup);

    let mut node: Option<&SyntaxNode> = None;
    let mut iter = reference.iter();
    let mut offset = differential;
    let mut replaced = 0;
    let mut stopped = false;

    'outer: while !p.eof() {
        // A dedent below `min_indent` would end the enclosing construct, so
        // this reparse attempt cannot succeed.
        if let Some(NodeKind::Space { newlines: (1 ..) }) = p.peek() {
            if p.column(p.current_end()) < min_indent {
                return None;
            }
        }

        markup_node(&mut p, &mut at_start);

        // Only start comparing against the reference tree once we are past
        // the edited range.
        if p.prev_end() <= end_pos {
            continue;
        }

        let recent = p.marker().before(&p).unwrap();
        let recent_start = p.prev_end() - recent.len();

        while offset <= recent_start as isize {
            if let Some(node) = node {
                // The nodes are equal, at the same position and have the
                // same content. The parsing trees have converged again, so
                // the reparse may stop here.
                if offset == recent_start as isize && node == recent {
                    replaced -= 1;
                    stopped = true;
                    break 'outer;
                }
            }

            if let Some(node) = node {
                offset += node.len() as isize;
            }

            node = iter.next();
            if node.is_none() {
                break;
            }

            replaced += 1;
        }
    }

    if p.eof() && !stopped {
        replaced = reference.len();
    }

    let (mut res, terminated) = p.consume()?;
    if stopped {
        // Drop the node that matched the reference; it is kept from the old
        // tree instead.
        res.pop().unwrap();
    }

    Some((res, terminated, replaced))
}

/// Parse markup.
///
/// If `at_start` is true, things like headings that may only appear at the
/// beginning of a line or content block are initially allowed.
fn markup(p: &mut Parser, mut at_start: bool) {
    p.perform(NodeKind::Markup { min_indent: 0 }, |p| {
        while !p.eof() {
            markup_node(p, &mut at_start);
        }
    });
}

/// Parse markup that stays right of the given `column`.
fn markup_indented(p: &mut Parser, min_indent: usize) {
    // Skip same-line trivia before the indented body.
    p.eat_while(|t| match t {
        NodeKind::Space { newlines } => *newlines == 0,
        NodeKind::LineComment | NodeKind::BlockComment => true,
        _ => false,
    });

    let marker = p.marker();
    let mut at_start = false;

    while !p.eof() {
        match p.peek() {
            // A dedent below `min_indent` ends the indented markup.
            Some(NodeKind::Space { newlines: (1 ..) })
                if p.column(p.current_end()) < min_indent =>
            {
                break;
            }
            _ => {}
        }

        markup_node(p, &mut at_start);
    }

    marker.end(p, NodeKind::Markup { min_indent });
}

/// Parse a line of markup that can prematurely end if `f` returns true.
fn markup_line<F>(p: &mut Parser, mut f: F)
where
    F: FnMut(&NodeKind) -> bool,
{
    // Skip same-line trivia before the line's content.
    p.eat_while(|t| match t {
        NodeKind::Space { newlines } => *newlines == 0,
        NodeKind::LineComment | NodeKind::BlockComment => true,
        _ => false,
    });

    p.perform(NodeKind::Markup { min_indent: usize::MAX }, |p| {
        let mut at_start = false;
        while let Some(kind) = p.peek() {
            // A line break always ends the line.
            if let NodeKind::Space { newlines: (1 ..) } = kind {
                break;
            }

            // The caller-provided predicate may end the line early.
            if f(kind) {
                break;
            }

            markup_node(p, &mut at_start);
        }
    });
}

/// Parse a markup node.
fn markup_node(p: &mut Parser, at_start: &mut bool) {
    let token = match p.peek() {
        Some(t) => t,
        None => return,
    };

    match token {
        // Whitespace.
        NodeKind::Space { newlines } => {
            *at_start |= *newlines > 0;
            p.eat();
            // Return early so `at_start` is not reset below.
            return;
        }

        // Comments.
        NodeKind::LineComment | NodeKind::BlockComment => {
            p.eat();
            return;
        }

        // Text and markup.
        NodeKind::Text(_)
        | NodeKind::Linebreak
        | NodeKind::SmartQuote { .. }
        | NodeKind::Escape(_)
        | NodeKind::Shorthand(_)
        | NodeKind::Link(_)
        | NodeKind::Raw(_)
        | NodeKind::Label(_)
        | NodeKind::Ref(_) => p.eat(),

        // Math.
        NodeKind::Dollar => math(p),

        // Strong, emph, heading.
        NodeKind::Star => strong(p),
        NodeKind::Underscore => emph(p),
        NodeKind::Eq => heading(p, *at_start),

        // Lists.
        NodeKind::Minus => list_node(p, *at_start),
        NodeKind::Plus | NodeKind::EnumNumbering(_) => enum_node(p, *at_start),
        NodeKind::Slash => {
            desc_node(p, *at_start).ok();
        }
        NodeKind::Colon => {
            // A stray colon is just text.
            let marker = p.marker();
            p.eat();
            marker.convert(p, NodeKind::Text(':'.into()));
        }

        // Hashtag + keyword / identifier.
        NodeKind::Ident(_)
        | NodeKind::Let
        | NodeKind::Set
        | NodeKind::Show
        | NodeKind::Wrap
        | NodeKind::If
        | NodeKind::While
        | NodeKind::For
        | NodeKind::Import
        | NodeKind::Include
        | NodeKind::Break
        | NodeKind::Continue
        | NodeKind::Return => markup_expr(p),

        // Code and content block.
        NodeKind::LeftBrace => code_block(p),
        NodeKind::LeftBracket => content_block(p),

        NodeKind::Error(_, _) => p.eat(),
        _ => p.unexpected(),
    };

    *at_start = false;
}

/// Parse strong content.
fn strong(p: &mut Parser) {
    p.perform(NodeKind::Strong, |p| {
        p.start_group(Group::Strong);
        markup(p, false);
        p.end_group();
    })
}

/// Parse emphasized content.
fn emph(p: &mut Parser) {
    p.perform(NodeKind::Emph, |p| {
        p.start_group(Group::Emph);
        markup(p, false);
        p.end_group();
    })
}

/// Parse a heading.
fn heading(p: &mut Parser, at_start: bool) {
    let marker = p.marker();
    let current_start = p.current_start();
    p.assert(NodeKind::Eq);
    while p.eat_if(NodeKind::Eq) {}

    if at_start && p.peek().map_or(true, |kind| kind.is_space()) {
        p.eat_while(|kind| *kind == NodeKind::Space { newlines: 0 });
        markup_line(p, |kind| matches!(kind, NodeKind::Label(_)));
        marker.end(p, NodeKind::Heading);
    } else {
        // Not a heading position: degrade the equals signs to plain text.
        let text = p.get(current_start .. p.prev_end()).into();
        marker.convert(p, NodeKind::Text(text));
    }
}

/// Parse a single list item.
fn list_node(p: &mut Parser, at_start: bool) {
    let marker = p.marker();
    let text: EcoString = p.peek_src().into();
    p.assert(NodeKind::Minus);

    let min_indent = p.column(p.prev_end());
    if at_start && p.eat_if(NodeKind::Space { newlines: 0 }) && !p.eof() {
        markup_indented(p, min_indent);
        marker.end(p, NodeKind::ListItem);
    } else {
        // Not a list position: degrade the marker to plain text.
        marker.convert(p, NodeKind::Text(text));
    }
}

/// Parse a single enum item.
fn enum_node(p: &mut Parser, at_start: bool) {
    let marker = p.marker();
    let text: EcoString = p.peek_src().into();
    p.eat();

    let min_indent = p.column(p.prev_end());
    if at_start && p.eat_if(NodeKind::Space { newlines: 0 }) && !p.eof() {
        markup_indented(p, min_indent);
        marker.end(p, NodeKind::EnumItem);
    } else {
        // Not an enum position: degrade the marker to plain text.
        marker.convert(p, NodeKind::Text(text));
    }
}

/// Parse a single description list item.
fn desc_node(p: &mut Parser, at_start: bool) -> ParseResult {
    let marker = p.marker();
    let text: EcoString = p.peek_src().into();
    p.eat();

    let min_indent = p.column(p.prev_end());
    if at_start && p.eat_if(NodeKind::Space { newlines: 0 }) && !p.eof() {
        // Term up to the colon, then the indented description body.
        markup_line(p, |node| matches!(node, NodeKind::Colon));
        p.expect(NodeKind::Colon)?;
        markup_indented(p, min_indent);
        marker.end(p, NodeKind::DescItem);
    } else {
        // Not a list position: degrade the marker to plain text.
        marker.convert(p, NodeKind::Text(text));
    }

    Ok(())
}

/// Parse an expression within a markup mode.
fn markup_expr(p: &mut Parser) {
    // Does the expression need termination or can content follow directly?
    let stmt = matches!(
        p.peek(),
        Some(
            NodeKind::Let
                | NodeKind::Set
                | NodeKind::Show
                | NodeKind::Wrap
                | NodeKind::Import
                | NodeKind::Include
        )
    );

    p.start_group(Group::Expr);
    let res = expr_prec(p, true, 0);
    if stmt && res.is_ok() && !p.eof() {
        p.expected("semicolon or line break");
    }
    p.end_group();
}

/// Parse math.
fn math(p: &mut Parser) {
    p.perform(NodeKind::Math, |p| {
        p.start_group(Group::Math);
        while !p.eof() {
            math_node(p);
        }
        p.end_group();
    });
}

/// Parse a math node.
fn math_node(p: &mut Parser) {
    math_node_prec(p, 0, None)
}

/// Parse a math node with operators having at least the minimum precedence.
fn math_node_prec(p: &mut Parser, min_prec: usize, stop: Option<NodeKind>) {
    let marker = p.marker();
    math_primary(p);

    loop {
        let (kind, mut prec, assoc, stop) = match p.peek() {
            // The caller's stop token ends this node.
            v if v == stop.as_ref() => break,
            Some(NodeKind::Underscore) => {
                (NodeKind::Script, 2, Assoc::Right, Some(NodeKind::Hat))
            }
            Some(NodeKind::Hat) => (
                NodeKind::Script,
                2,
                Assoc::Right,
                Some(NodeKind::Underscore),
            ),
            Some(NodeKind::Slash) => (NodeKind::Frac, 1, Assoc::Left, None),
            _ => break,
        };

        if prec < min_prec {
            break;
        }

        match assoc {
            Assoc::Left => prec += 1,
            Assoc::Right => {}
        }

        p.eat();
        math_node_prec(p, prec, stop);

        // Allow up to two different scripts. We do not risk encountering the
        // previous script kind again here due to right-associativity.
        if p.eat_if(NodeKind::Underscore) || p.eat_if(NodeKind::Hat) {
            math_node_prec(p, prec, None);
        }

        marker.end(p, kind);
    }
}

/// Parse a primary math node.
fn math_primary(p: &mut Parser) {
    let token = match p.peek() {
        Some(t) => t,
        None => return,
    };

    match token {
        // Spaces, atoms and expressions.
        NodeKind::Space { .. }
        | NodeKind::Linebreak
        | NodeKind::Escape(_)
        | NodeKind::Atom(_)
        | NodeKind::Ident(_) => p.eat(),

        // Groups.
        NodeKind::LeftParen => group(p, Group::Paren, '(', ')'),
        NodeKind::LeftBracket => group(p, Group::Bracket, '[', ']'),
        NodeKind::LeftBrace => group(p, Group::Brace, '{', '}'),

        // Alignment indicator.
        NodeKind::Amp => align(p),

        _ => p.unexpected(),
    }
}

/// Parse grouped math.
fn group(p: &mut Parser, group: Group, l: char, r: char) {
    p.perform(NodeKind::Math, |p| {
        // Convert the delimiters into atoms so they are part of the math.
        let marker = p.marker();
        p.start_group(group);
        marker.convert(p, NodeKind::Atom(l.into()));
        while !p.eof() {
            math_node(p);
        }
        let marker = p.marker();
        p.end_group();
        marker.convert(p, NodeKind::Atom(r.into()));
    })
}

/// Parse an alignment indicator.
fn align(p: &mut Parser) {
    p.perform(NodeKind::Align, |p| {
        p.assert(NodeKind::Amp);
        while p.eat_if(NodeKind::Amp) {}
    })
}

/// Parse an expression.
fn expr(p: &mut Parser) -> ParseResult {
    expr_prec(p, false, 0)
}

/// Parse an expression with operators having at least the minimum precedence.
///
/// If `atomic` is true, this does not parse binary operations and arrow
/// functions, which is exactly what we want in a shorthand expression directly
/// in markup.
///
/// Stops parsing at operations with lower precedence than `min_prec`.
fn expr_prec(p: &mut Parser, atomic: bool, min_prec: usize) -> ParseResult {
    let marker = p.marker();

    // Start the unary expression.
    match p.peek().and_then(UnOp::from_token) {
        Some(op) if !atomic => {
            p.eat();
            let prec = op.precedence();
            expr_prec(p, atomic, prec)?;
            marker.end(p, NodeKind::Unary);
        }
        _ => primary(p, atomic)?,
    };

    loop {
        // Parenthesis or bracket means this is a function call.
        if let Some(NodeKind::LeftParen | NodeKind::LeftBracket) = p.peek_direct() {
            marker.perform(p, NodeKind::FuncCall, args)?;
            continue;
        }

        if atomic {
            break;
        }

        // Method call or field access.
        if p.eat_if(NodeKind::Dot) {
            ident(p)?;
            if let Some(NodeKind::LeftParen | NodeKind::LeftBracket) = p.peek_direct() {
                marker.perform(p, NodeKind::MethodCall, args)?;
            } else {
                marker.end(p, NodeKind::FieldAccess);
            }
            continue;
        }

        // `not` must be followed by `in` to form the `not in` operator.
        let op = if p.eat_if(NodeKind::Not) {
            if p.at(NodeKind::In) {
                BinOp::NotIn
            } else {
                p.expected("keyword `in`");
                return Err(ParseError);
            }
        } else {
            match p.peek().and_then(BinOp::from_token) {
                Some(binop) => binop,
                None => break,
            }
        };

        let mut prec = op.precedence();
        if prec < min_prec {
            break;
        }

        p.eat();

        match op.assoc() {
            Assoc::Left => prec += 1,
            Assoc::Right => {}
        }

        marker.perform(p, NodeKind::Binary, |p| expr_prec(p, atomic, prec))?;
    }

    Ok(())
}

/// Parse a primary expression.
fn primary(p: &mut Parser, atomic: bool) -> ParseResult {
    if literal(p) {
        return Ok(());
    }

    match p.peek() {
        // Things that start with an identifier.
        Some(NodeKind::Ident(_)) => {
            let marker = p.marker();
            p.eat();

            // Arrow means this is a closure's lone parameter.
            if !atomic && p.at(NodeKind::Arrow) {
                marker.end(p, NodeKind::Params);
                p.assert(NodeKind::Arrow);
                marker.perform(p, NodeKind::Closure, expr)
            } else {
                Ok(())
            }
        }

        // Structures.
        Some(NodeKind::LeftParen) => parenthesized(p, atomic),
        Some(NodeKind::LeftBrace) => Ok(code_block(p)),
        Some(NodeKind::LeftBracket) => Ok(content_block(p)),

        // Keywords.
        Some(NodeKind::Let) => let_expr(p),
        Some(NodeKind::Set) => set_expr(p),
        Some(NodeKind::Show) => show_expr(p),
        Some(NodeKind::Wrap) => wrap_expr(p),
        Some(NodeKind::If) => if_expr(p),
        Some(NodeKind::While) => while_expr(p),
        Some(NodeKind::For) => for_expr(p),
        Some(NodeKind::Import) => import_expr(p),
        Some(NodeKind::Include) => include_expr(p),
        Some(NodeKind::Break) => break_expr(p),
        Some(NodeKind::Continue) => continue_expr(p),
        Some(NodeKind::Return) => return_expr(p),

        // An error token from the tokenizer: consume it, but fail the parse.
        Some(NodeKind::Error(_, _)) => {
            p.eat();
            Err(ParseError)
        }

        // Nothing.
        _ => {
            p.expected_found("expression");
            Err(ParseError)
        }
    }
}

/// Parse a literal.
fn literal(p: &mut Parser) -> bool {
    match p.peek() {
        // Basic values.
        Some(
            NodeKind::None
            | NodeKind::Auto
            | NodeKind::Int(_)
            | NodeKind::Float(_)
            | NodeKind::Bool(_)
            | NodeKind::Numeric(_, _)
            | NodeKind::Str(_),
        ) => {
            p.eat();
            true
        }

        _ => false,
    }
}

/// Parse an identifier.
fn ident(p: &mut Parser) -> ParseResult {
    match p.peek() {
        Some(NodeKind::Ident(_)) => {
            p.eat();
            Ok(())
        }
        _ => {
            p.expected_found("identifier");
            Err(ParseError)
        }
    }
}

/// Parse something that starts with a parenthesis, which can be either of:
/// - Array literal
/// - Dictionary literal
/// - Parenthesized expression
/// - Parameter list of closure expression
fn parenthesized(p: &mut Parser, atomic: bool) -> ParseResult {
    let marker = p.marker();

    p.start_group(Group::Paren);
    let colon = p.eat_if(NodeKind::Colon);
    let kind = collection(p, true).0;
    p.end_group();

    // Leading colon makes this a dictionary.
    if colon {
        dict(p, marker);
        return Ok(());
    }

    // Arrow means this is a closure's parameter list.
    if !atomic && p.at(NodeKind::Arrow) {
        params(p, marker);
        p.assert(NodeKind::Arrow);
        return marker.perform(p, NodeKind::Closure, expr);
    }

    // Transform into the identified collection.
    match kind {
        CollectionKind::Group => marker.end(p, NodeKind::Parenthesized),
        CollectionKind::Positional => array(p, marker),
        CollectionKind::Named => dict(p, marker),
    }

    Ok(())
}

/// The type of a collection.
#[derive(Debug, Copy, Clone, Eq, PartialEq)]
enum CollectionKind {
    /// The collection is only one item and has no comma.
    Group,
    /// The collection starts with a positional item and has multiple items or a
    /// trailing comma.
    Positional,
    /// The collection starts with a colon or named item.
    Named,
}

/// Parse a collection.
///
/// Returns the identified collection kind and the number of items parsed.
fn collection(p: &mut Parser, keyed: bool) -> (CollectionKind, usize) {
    let mut kind = None;
    let mut items = 0;
    let mut can_group = true;
    // Marker at which a comma was expected but missing; reported lazily once
    // another item actually follows.
    let mut missing_coma: Option<Marker> = None;

    while !p.eof() {
        if let Ok(item_kind) = item(p, keyed) {
            match item_kind {
                NodeKind::Spread => can_group = false,
                NodeKind::Named if kind.is_none() => {
                    kind = Some(CollectionKind::Named);
                    can_group = false;
                }
                _ if kind.is_none() => {
                    kind = Some(CollectionKind::Positional);
                }
                _ => {}
            }

            items += 1;

            if let Some(marker) = missing_coma.take() {
                p.expected_at(marker, "comma");
            }

            if p.eof() {
                break;
            }

            if p.eat_if(NodeKind::Comma) {
                // A comma means this cannot be a parenthesized expression.
                can_group = false;
            } else {
                missing_coma = Some(p.trivia_start());
            }
        } else {
            // Skip past the failed item to recover.
            p.eat_if(NodeKind::Comma);
            kind = Some(CollectionKind::Group);
        }
    }

    let kind = if can_group && items == 1 {
        CollectionKind::Group
    } else {
        kind.unwrap_or(CollectionKind::Positional)
    };

    (kind, items)
}

/// Parse an expression or a named pair, returning whether it's a spread or a
/// named pair.
fn item(p: &mut Parser, keyed: bool) -> ParseResult<NodeKind> {
    let marker = p.marker();
    if p.eat_if(NodeKind::Dots) {
        marker.perform(p, NodeKind::Spread, expr)?;
        return Ok(NodeKind::Spread);
    }

    expr(p)?;

    if p.at(NodeKind::Colon) {
        // The preceding expression is the key; only identifiers (and strings,
        // if `keyed`) are valid keys.
        match marker.after(p).map(|c| c.kind()) {
            Some(NodeKind::Ident(_)) => {
                p.eat();
                marker.perform(p, NodeKind::Named, expr)?;
            }
            Some(NodeKind::Str(_)) if keyed => {
                p.eat();
                marker.perform(p, NodeKind::Keyed, expr)?;
            }
            kind => {
                let mut msg = EcoString::from("expected identifier");
                if keyed {
                    msg.push_str(" or string");
                }
                if let Some(kind) = kind {
                    msg.push_str(", found ");
                    msg.push_str(kind.name());
                }
                let error = NodeKind::Error(ErrorPos::Full, msg);
                marker.end(p, error);
                // Still parse the value to keep going.
                p.eat();
                marker.perform(p, NodeKind::Named, expr).ok();
                return Err(ParseError);
            }
        }

        Ok(NodeKind::Named)
    } else {
        Ok(NodeKind::None)
    }
}

/// Convert a collection into an array, producing errors for anything other than
/// expressions.
fn array(p: &mut Parser, marker: Marker) {
    marker.filter_children(p, |x| match x.kind() {
        NodeKind::Named | NodeKind::Keyed => Err("expected expression"),
        _ => Ok(()),
    });
    marker.end(p, NodeKind::Array);
}

/// Convert a collection into a dictionary, producing errors for anything other
/// than named and keyed pairs.
fn dict(p: &mut Parser, marker: Marker) {
    // Track keys to report duplicates.
    let mut used = HashSet::new();
    marker.filter_children(p, |x| match x.kind() {
        kind if kind.is_paren() => Ok(()),
        NodeKind::Named | NodeKind::Keyed => {
            if let Some(NodeKind::Ident(key) | NodeKind::Str(key)) =
                x.children().next().map(|child| child.kind())
            {
                if !used.insert(key.clone()) {
                    return Err("pair has duplicate key");
                }
            }
            Ok(())
        }
        NodeKind::Spread | NodeKind::Comma | NodeKind::Colon => Ok(()),
        _ => Err("expected named or keyed pair"),
    });
    marker.end(p, NodeKind::Dict);
}

/// Convert a collection into a list of parameters, producing errors for
/// anything other than identifiers, spread operations and named pairs.
fn params(p: &mut Parser, marker: Marker) {
    marker.filter_children(p, |x| match x.kind() {
        kind if kind.is_paren() => Ok(()),
        NodeKind::Named | NodeKind::Ident(_) | NodeKind::Comma => Ok(()),
        // An argument sink must sink into an identifier.
        NodeKind::Spread
            if matches!(
                x.children().last().map(|child| child.kind()),
                Some(&NodeKind::Ident(_))
            ) =>
        {
            Ok(())
        }
        _ => Err("expected identifier, named pair or argument sink"),
    });
    marker.end(p, NodeKind::Params);
}

/// Parse a code block: `{...}`.
fn code_block(p: &mut Parser) {
    p.perform(NodeKind::CodeBlock, |p| {
        p.start_group(Group::Brace);
        code(p);
        p.end_group();
    });
}

/// Parse expressions.
fn code(p: &mut Parser) {
    while !p.eof() {
        p.start_group(Group::Expr);
        if expr(p).is_ok() && !p.eof() {
            p.expected("semicolon or line break");
        }
        p.end_group();

        // Forcefully skip over newlines since the group's contents can't.
        p.eat_while(NodeKind::is_space);
    }
}

/// Parse a content block: `[...]`.
fn content_block(p: &mut Parser) {
    p.perform(NodeKind::ContentBlock, |p| {
        p.start_group(Group::Bracket);
        markup(p, true);
        p.end_group();
    });
}

/// Parse the arguments to a function call.
fn args(p: &mut Parser) -> ParseResult {
    match p.peek_direct() {
        Some(NodeKind::LeftParen) => {}
        Some(NodeKind::LeftBracket) => {}
        _ => {
            p.expected_found("argument list");
            return Err(ParseError);
        }
    }

    p.perform(NodeKind::Args, |p| {
        if p.at(NodeKind::LeftParen) {
            let marker = p.marker();
            p.start_group(Group::Paren);
            collection(p, false);
            p.end_group();

            // Report duplicate named arguments.
            let mut used = HashSet::new();
            marker.filter_children(p, |x| match x.kind() {
                NodeKind::Named => {
                    if let Some(NodeKind::Ident(ident)) =
                        x.children().next().map(|child| child.kind())
                    {
                        if !used.insert(ident.clone()) {
                            return Err("duplicate argument");
                        }
                    }
                    Ok(())
                }
                _ => Ok(()),
            });
        }

        // Trailing content blocks are also arguments.
        while p.peek_direct() == Some(&NodeKind::LeftBracket) {
            content_block(p);
        }
    });

    Ok(())
}

/// Parse a let expression.
fn let_expr(p: &mut Parser) -> ParseResult {
    p.perform(NodeKind::LetBinding, |p| {
        p.assert(NodeKind::Let);

        let marker = p.marker();
        ident(p)?;

        // If a parenthesis follows, this is a function definition.
        let has_params = p.peek_direct() == Some(&NodeKind::LeftParen);
        if has_params {
            let marker = p.marker();
            p.start_group(Group::Paren);
            collection(p, false);
            p.end_group();
            params(p, marker);
        }

        if p.eat_if(NodeKind::Eq) {
            expr(p)?;
        } else if has_params {
            // Function definitions must have a body.
            p.expected("body");
        }

        // Rewrite into a closure expression if it's a function definition.
        if has_params {
            marker.end(p, NodeKind::Closure);
        }

        Ok(())
    })
}

/// Parse a set expression.
fn set_expr(p: &mut Parser) -> ParseResult {
    p.perform(NodeKind::SetRule, |p| {
        p.assert(NodeKind::Set);
        ident(p)?;
        args(p)
    })
}

/// Parse a show expression.
fn show_expr(p: &mut Parser) -> ParseResult {
    p.perform(NodeKind::ShowRule, |p| {
        p.assert(NodeKind::Show);
        let marker = p.marker();
        expr(p)?;
        if p.eat_if(NodeKind::Colon) {
            // Before the colon, only an identifier (the binding) is allowed.
            marker.filter_children(p, |child| match child.kind() {
                NodeKind::Ident(_) | NodeKind::Colon => Ok(()),
                _ => Err("expected identifier"),
            });
            expr(p)?;
        }
        p.expect(NodeKind::As)?;
        expr(p)
    })
}

/// Parse a wrap expression.
fn wrap_expr(p: &mut Parser) -> ParseResult {
    p.perform(NodeKind::WrapRule, |p| {
        p.assert(NodeKind::Wrap);
        ident(p)?;
        p.expect(NodeKind::In)?;
        expr(p)
    })
}

/// Parse an if-else expression.
fn if_expr(p: &mut Parser) -> ParseResult {
    p.perform(NodeKind::Conditional, |p| {
        p.assert(NodeKind::If);

        expr(p)?;
        body(p)?;

        if p.eat_if(NodeKind::Else) {
            // `else if` chains nest as conditionals in the else branch.
            if p.at(NodeKind::If) {
                if_expr(p)?;
            } else {
                body(p)?;
            }
        }

        Ok(())
    })
}

/// Parse a while expression.
fn while_expr(p: &mut Parser) -> ParseResult {
    p.perform(NodeKind::WhileLoop, |p| {
        p.assert(NodeKind::While);
        expr(p)?;
        body(p)
    })
}

/// Parse a for-in expression.
fn for_expr(p: &mut Parser) -> ParseResult {
    p.perform(NodeKind::ForLoop, |p| {
        p.assert(NodeKind::For);
        for_pattern(p)?;
        p.expect(NodeKind::In)?;
        expr(p)?;
        body(p)
    })
}

/// Parse a for loop pattern.
fn for_pattern(p: &mut Parser) -> ParseResult {
    p.perform(NodeKind::ForPattern, |p| {
        ident(p)?;
        // An optional second binding, e.g. for key-value iteration.
        if p.eat_if(NodeKind::Comma) {
            ident(p)?;
        }
        Ok(())
    })
}

/// Parse an import expression.
fn import_expr(p: &mut Parser) -> ParseResult {
    p.perform(NodeKind::ModuleImport, |p| {
        p.assert(NodeKind::Import);

        if !p.eat_if(NodeKind::Star) {
            // This is the list of identifiers scenario.
            p.perform(NodeKind::ImportItems, |p| {
                p.start_group(Group::Imports);
                let marker = p.marker();
                let items = collection(p, false).1;
                if items == 0 {
                    p.expected("import items");
                }
                p.end_group();

                marker.filter_children(p, |n| match n.kind() {
                    NodeKind::Ident(_) | NodeKind::Comma => Ok(()),
                    _ => Err("expected identifier"),
                });
            });
        };

        p.expect(NodeKind::From)?;
        expr(p)
    })
}

/// Parse an include expression.
fn include_expr(p: &mut Parser) -> ParseResult {
    p.perform(NodeKind::ModuleInclude, |p| {
        p.assert(NodeKind::Include);
        expr(p)
    })
}

/// Parse a break expression.
fn break_expr(p: &mut Parser) -> ParseResult {
    p.perform(NodeKind::BreakStmt, |p| {
        p.assert(NodeKind::Break);
        Ok(())
    })
}

/// Parse a continue expression.
fn continue_expr(p: &mut Parser) -> ParseResult {
    p.perform(NodeKind::ContinueStmt, |p| {
        p.assert(NodeKind::Continue);
        Ok(())
    })
}

/// Parse a return expression.
fn return_expr(p: &mut Parser) -> ParseResult {
    p.perform(NodeKind::ReturnStmt, |p| {
        p.assert(NodeKind::Return);
        // The return value is optional.
        if !p.at(NodeKind::Comma) && !p.eof() {
            expr(p)?;
        }
        Ok(())
    })
}

/// Parse a control flow body.
fn body(p: &mut Parser) -> ParseResult {
    match p.peek() {
        Some(NodeKind::LeftBracket) => Ok(content_block(p)),
        Some(NodeKind::LeftBrace) => Ok(code_block(p)),
        _ => {
            p.expected("body");
            Err(ParseError)
        }
    }
}
diff --git a/src/syntax/resolve.rs b/src/syntax/resolve.rs
new file mode 100644
index 00000000..2ad35cec
--- /dev/null
+++ b/src/syntax/resolve.rs
@@ -0,0 +1,237 @@
use unscanny::Scanner;

use super::{is_ident, is_newline, RawKind};
use crate::util::EcoString;

/// Resolve all escape sequences in a string.
pub fn resolve_string(string: &str) -> EcoString {
    let mut out = EcoString::with_capacity(string.len());
    let mut s = Scanner::new(string);

    while let Some(c) = s.eat() {
        if c != '\\' {
            out.push(c);
            continue;
        }

        // Remember where the escape started so unknown sequences can be kept
        // verbatim.
        let start = s.locate(-1);
        match s.eat() {
            Some('\\') => out.push('\\'),
            Some('"') => out.push('"'),
            Some('n') => out.push('\n'),
            Some('r') => out.push('\r'),
            Some('t') => out.push('\t'),
            Some('u') if s.eat_if('{') => {
                // TODO: Error if closing brace is missing.
                let sequence = s.eat_while(char::is_ascii_hexdigit);
                let _terminated = s.eat_if('}');
                match resolve_hex(sequence) {
                    Some(c) => out.push(c),
                    None => out.push_str(s.from(start)),
                }
            }

            _ => out.push_str(s.from(start)),
        }
    }

    out
}

/// Resolve a hexadecimal escape sequence into a character
/// (only the inner hex letters without braces or `\u`).
pub fn resolve_hex(sequence: &str) -> Option<char> {
    u32::from_str_radix(sequence, 16).ok().and_then(std::char::from_u32)
}

/// Resolve the language tag and trim the raw text.
pub fn resolve_raw(column: usize, backticks: usize, text: &str) -> RawKind {
    if backticks > 1 {
        // Multiple backticks may carry a language tag.
        let (tag, inner) = split_at_lang_tag(text);
        let (text, block) = trim_and_split_raw(column, inner);
        RawKind {
            lang: is_ident(tag).then(|| tag.into()),
            text: text.into(),
            block,
        }
    } else {
        RawKind {
            lang: None,
            text: split_lines(text).join("\n").into(),
            block: false,
        }
    }
}

/// Parse the lang tag and return it alongside the remaining inner raw text.
fn split_at_lang_tag(raw: &str) -> (&str, &str) {
    let mut s = Scanner::new(raw);
    (
        s.eat_until(|c: char| c == '`' || c.is_whitespace() || is_newline(c)),
        s.after(),
    )
}

/// Trim raw text and split it into lines.
///
/// Also returns whether at least one newline was contained in `raw`.
fn trim_and_split_raw(column: usize, mut raw: &str) -> (String, bool) {
    // Trims one space at the start.
    raw = raw.strip_prefix(' ').unwrap_or(raw);

    // Trim one space at the end if the last non-whitespace char is a backtick.
    if raw.trim_end().ends_with('`') {
        raw = raw.strip_suffix(' ').unwrap_or(raw);
    }

    let mut lines = split_lines(raw);

    // Dedent based on column, but not for the first line.
    for line in lines.iter_mut().skip(1) {
        let offset = line
            .chars()
            .take(column)
            .take_while(|c| c.is_whitespace())
            .map(char::len_utf8)
            .sum();
        *line = &line[offset ..];
    }

    let had_newline = lines.len() > 1;
    let is_whitespace = |line: &&str| line.chars().all(char::is_whitespace);

    // Trims a sequence of whitespace followed by a newline at the start.
    if lines.first().map_or(false, is_whitespace) {
        lines.remove(0);
    }

    // Trims a newline followed by a sequence of whitespace at the end.
    if lines.last().map_or(false, is_whitespace) {
        lines.pop();
    }

    (lines.join("\n"), had_newline)
}

/// Split a string into a vector of lines
/// (respecting Unicode, Unix, Mac and Windows line breaks).
fn split_lines(text: &str) -> Vec<&str> {
    let mut s = Scanner::new(text);
    let mut lines = Vec::new();
    let mut start = 0;
    let mut end = 0;

    while let Some(c) = s.eat() {
        if is_newline(c) {
            // Treat \r\n as a single line break.
            if c == '\r' {
                s.eat_if('\n');
            }

            lines.push(&text[start .. end]);
            start = s.cursor();
        }
        end = s.cursor();
    }

    lines.push(&text[start ..]);
    lines
}

#[cfg(test)]
#[rustfmt::skip]
mod tests {
    use super::*;

    #[test]
    fn test_resolve_strings() {
        #[track_caller]
        fn test(string: &str, expected: &str) {
            assert_eq!(resolve_string(string), expected);
        }

        test(r#"hello world"#,  "hello world");
        test(r#"hello\nworld"#, "hello\nworld");
        test(r#"a\"bc"#,        "a\"bc");
        test(r#"a\u{2603}bc"#,  "a☃bc");
        test(r#"a\u{26c3bg"#,   "a𦰻g");
        test(r#"av\u{6797"#,    "av林");
        test(r#"a\\"#,          "a\\");
        test(r#"a\\\nbc"#,      "a\\\nbc");
        test(r#"a\t\r\nbc"#,    "a\t\r\nbc");
        test(r"🌎",             "🌎");
        test(r"🌎\",            r"🌎\");
        test(r"\🌎",            r"\🌎");
    }

    #[test]
    fn test_split_at_lang_tag() {
        #[track_caller]
        fn test(text: &str, lang: &str, inner: &str) {
            assert_eq!(split_at_lang_tag(text), (lang, inner));
        }

        test("typst it!",   "typst", " it!");
        test("typst\n it!", "typst", "\n it!");
        test("typst\n it!", "typst", "\n it!");
        test("abc`",        "abc",   "`");
        test(" hi",         "",      " hi");
        test("`",           "",      "`");
    }

    #[test]
    fn test_resolve_raw() {
        #[track_caller]
        fn test(
            column: usize,
            backticks: usize,
            raw: &str,
            lang: Option<&str>,
            text: &str,
            block: bool,
        ) {
            let node = resolve_raw(column, backticks, raw);
            assert_eq!(node.lang.as_deref(), lang);
            assert_eq!(node.text, text);
            assert_eq!(node.block, block);
        }

        // Just one backtick.
        test(0, 1, "py", None, "py", false);
        test(0, 1, "1\n2", None, "1\n2", false);
        test(0, 1, "1\r\n2", None, "1\n2", false);

        // More than one backtick with lang tag.
        test(0, 2, "js alert()", Some("js"), "alert()", false);
        test(0, 3, "py quit(\n\n)", Some("py"), "quit(\n\n)", true);
        test(0, 2, "♥", None, "", false);

        // Trimming of whitespace (tested more thoroughly in separate test).
        test(0, 2, " a", None, "a", false);
        test(0, 2, "  a", None, " a", false);
        test(0, 2, " \na", None, "a", true);

        // Dedenting
        test(2, 3, " def foo():\n    bar()", None, "def foo():\n  bar()", true);
    }

    #[test]
    fn test_trim_raw() {
        #[track_caller]
        fn test(text: &str, expected: &str) {
            assert_eq!(trim_and_split_raw(0, text).0, expected);
        }

        test(" hi", "hi");
        test("  hi", " hi");
        test("\nhi", "hi");
        test("    \n hi", " hi");
        test("hi` ", "hi`");
        test("hi`  ", "hi` ");
        test("hi` ", "hi` ");
        test("hi ", "hi ");
        test("hi  ", "hi  ");
        test("hi\n", "hi");
        test("hi \n ", "hi ");
        test("  \n hi \n  ", " hi ");
    }
}
diff --git a/src/syntax/source.rs b/src/syntax/source.rs
new file mode 100644
index 00000000..1b87b1c9
--- /dev/null
+++ b/src/syntax/source.rs
@@ -0,0 +1,430 @@
//! Source file management.

use std::fmt::{self, Debug, Formatter};
use std::hash::{Hash, Hasher};
use std::ops::Range;
use std::path::{Path, PathBuf};

use comemo::Prehashed;
use unscanny::Scanner;

use crate::diag::SourceResult;
use crate::syntax::ast::Markup;
use crate::syntax::{is_newline, parse, reparse};
use crate::syntax::{Span, SyntaxNode};
use crate::util::{PathExt, StrExt};

/// A source file.
///
/// All line and column indices start at zero, just like byte indices. Only for
/// user-facing display, you should add 1 to them.
pub struct Source {
    id: SourceId,
    path: PathBuf,
    text: Prehashed<String>,
    lines: Vec<Line>,
    root: Prehashed<SyntaxNode>,
}

impl Source {
    /// Create a new source file.
+ pub fn new(id: SourceId, path: &Path, text: String) -> Self { + let lines = std::iter::once(Line { byte_idx: 0, utf16_idx: 0 }) + .chain(lines(0, 0, &text)) + .collect(); + + let mut root = parse(&text); + root.numberize(id, Span::FULL).unwrap(); + + Self { + id, + path: path.normalize(), + text: Prehashed::new(text), + lines, + root: Prehashed::new(root), + } + } + + /// Create a source file without a real id and path, usually for testing. + pub fn detached(text: impl Into<String>) -> Self { + Self::new(SourceId::detached(), Path::new(""), text.into()) + } + + /// Create a source file with the same synthetic span for all nodes. + pub fn synthesized(text: impl Into<String>, span: Span) -> Self { + let mut file = Self::detached(text); + let mut root = file.root.into_inner(); + root.synthesize(span); + file.root = Prehashed::new(root); + file.id = span.source(); + file + } + + /// The root node of the file's untyped syntax tree. + pub fn root(&self) -> &SyntaxNode { + &self.root + } + + /// The root node of the file's typed abstract syntax tree. + pub fn ast(&self) -> SourceResult<Markup> { + let errors = self.root.errors(); + if errors.is_empty() { + Ok(self.root.cast().expect("root node must be markup")) + } else { + Err(Box::new(errors)) + } + } + + /// The id of the source file. + pub fn id(&self) -> SourceId { + self.id + } + + /// The normalized path to the source file. + pub fn path(&self) -> &Path { + &self.path + } + + /// The whole source as a string slice. + pub fn text(&self) -> &str { + &self.text + } + + /// Slice out the part of the source code enclosed by the range. + pub fn get(&self, range: Range<usize>) -> Option<&str> { + self.text.get(range) + } + + /// Fully replace the source text and increase the revision number. 
+ pub fn replace(&mut self, text: String) { + self.text = Prehashed::new(text); + self.lines = vec![Line { byte_idx: 0, utf16_idx: 0 }]; + self.lines.extend(lines(0, 0, &self.text)); + let mut root = parse(&self.text); + root.numberize(self.id(), Span::FULL).unwrap(); + self.root = Prehashed::new(root); + } + + /// Edit the source file by replacing the given range and increase the + /// revision number. + /// + /// Returns the range in the new source that was ultimately reparsed. + /// + /// The method panics if the `replace` range is out of bounds. + pub fn edit(&mut self, replace: Range<usize>, with: &str) -> Range<usize> { + let start_byte = replace.start; + let start_utf16 = self.byte_to_utf16(replace.start).unwrap(); + let mut text = std::mem::take(&mut self.text).into_inner(); + text.replace_range(replace.clone(), with); + self.text = Prehashed::new(text); + + // Remove invalidated line starts. + let line = self.byte_to_line(start_byte).unwrap(); + self.lines.truncate(line + 1); + + // Handle adjoining of \r and \n. + if self.text[.. start_byte].ends_with('\r') && with.starts_with('\n') { + self.lines.pop(); + } + + // Recalculate the line starts after the edit. + self.lines + .extend(lines(start_byte, start_utf16, &self.text[start_byte ..])); + + // Incrementally reparse the replaced range. + let mut root = std::mem::take(&mut self.root).into_inner(); + let range = reparse(&mut root, &self.text, replace, with.len()); + self.root = Prehashed::new(root); + range + } + + /// Get the length of the file in UTF-8 encoded bytes. + pub fn len_bytes(&self) -> usize { + self.text.len() + } + + /// Get the length of the file in UTF-16 code units. + pub fn len_utf16(&self) -> usize { + let last = self.lines.last().unwrap(); + last.utf16_idx + self.text[last.byte_idx ..].len_utf16() + } + + /// Get the length of the file in lines. + pub fn len_lines(&self) -> usize { + self.lines.len() + } + + /// Map a span that points into this source file to a byte range. 
+ /// + /// Panics if the span does not point into this source file. + pub fn range(&self, span: Span) -> Range<usize> { + self.root + .range(span, 0) + .expect("span does not point into this source file") + } + + /// Return the index of the UTF-16 code unit at the byte index. + pub fn byte_to_utf16(&self, byte_idx: usize) -> Option<usize> { + let line_idx = self.byte_to_line(byte_idx)?; + let line = self.lines.get(line_idx)?; + let head = self.text.get(line.byte_idx .. byte_idx)?; + Some(line.utf16_idx + head.len_utf16()) + } + + /// Return the index of the line that contains the given byte index. + pub fn byte_to_line(&self, byte_idx: usize) -> Option<usize> { + (byte_idx <= self.text.len()).then(|| { + match self.lines.binary_search_by_key(&byte_idx, |line| line.byte_idx) { + Ok(i) => i, + Err(i) => i - 1, + } + }) + } + + /// Return the index of the column at the byte index. + /// + /// The column is defined as the number of characters in the line before the + /// byte index. + pub fn byte_to_column(&self, byte_idx: usize) -> Option<usize> { + let line = self.byte_to_line(byte_idx)?; + let start = self.line_to_byte(line)?; + let head = self.get(start .. byte_idx)?; + Some(head.chars().count()) + } + + /// Return the byte index at the UTF-16 code unit. + pub fn utf16_to_byte(&self, utf16_idx: usize) -> Option<usize> { + let line = self.lines.get( + match self.lines.binary_search_by_key(&utf16_idx, |line| line.utf16_idx) { + Ok(i) => i, + Err(i) => i - 1, + }, + )?; + + let mut k = line.utf16_idx; + for (i, c) in self.text[line.byte_idx ..].char_indices() { + if k >= utf16_idx { + return Some(line.byte_idx + i); + } + k += c.len_utf16(); + } + + (k == utf16_idx).then(|| self.text.len()) + } + + + /// Return the byte position at which the given line starts. + pub fn line_to_byte(&self, line_idx: usize) -> Option<usize> { + self.lines.get(line_idx).map(|line| line.byte_idx) + } + + /// Return the range which encloses the given line. 
+ pub fn line_to_range(&self, line_idx: usize) -> Option<Range<usize>> { + let start = self.line_to_byte(line_idx)?; + let end = self.line_to_byte(line_idx + 1).unwrap_or(self.text.len()); + Some(start .. end) + } + + /// Return the byte index of the given (line, column) pair. + /// + /// The column defines the number of characters to go beyond the start of + /// the line. + pub fn line_column_to_byte( + &self, + line_idx: usize, + column_idx: usize, + ) -> Option<usize> { + let range = self.line_to_range(line_idx)?; + let line = self.get(range.clone())?; + let mut chars = line.chars(); + for _ in 0 .. column_idx { + chars.next(); + } + Some(range.start + (line.len() - chars.as_str().len())) + } +} + +impl Debug for Source { + fn fmt(&self, f: &mut Formatter) -> fmt::Result { + write!(f, "Source({})", self.path.display()) + } +} + +impl Hash for Source { + fn hash<H: Hasher>(&self, state: &mut H) { + self.id.hash(state); + self.path.hash(state); + self.text.hash(state); + self.root.hash(state); + } +} + +/// A unique identifier for a loaded source file. +#[derive(Debug, Copy, Clone, Eq, PartialEq, Hash)] +pub struct SourceId(u16); + +impl SourceId { + /// Create a new source id for a file that is not part of a store. + pub const fn detached() -> Self { + Self(u16::MAX) + } + + /// Create a source id from a number. + pub const fn from_u16(v: u16) -> Self { + Self(v) + } + + /// Extract the underlying number. + pub const fn into_u16(self) -> u16 { + self.0 + } +} + +/// Metadata about a line. +#[derive(Debug, Copy, Clone, Eq, PartialEq)] +struct Line { + /// The UTF-8 byte offset where the line starts. + byte_idx: usize, + /// The UTF-16 codepoint offset where the line starts. + utf16_idx: usize, +} + +/// Iterate over the lines in the string. 
+fn lines( + byte_offset: usize, + utf16_offset: usize, + text: &str, +) -> impl Iterator<Item = Line> + '_ { + let mut s = Scanner::new(text); + let mut utf16_idx = utf16_offset; + + std::iter::from_fn(move || { + s.eat_until(|c: char| { + utf16_idx += c.len_utf16(); + is_newline(c) + }); + + if s.done() { + return None; + } + + if s.eat() == Some('\r') && s.eat_if('\n') { + utf16_idx += 1; + } + + Some(Line { + byte_idx: byte_offset + s.cursor(), + utf16_idx, + }) + }) +} + +#[cfg(test)] +mod tests { + use super::*; + + const TEST: &str = "ä\tcde\nf💛g\r\nhi\rjkl"; + + #[test] + fn test_source_file_new() { + let source = Source::detached(TEST); + assert_eq!(source.lines, [ + Line { byte_idx: 0, utf16_idx: 0 }, + Line { byte_idx: 7, utf16_idx: 6 }, + Line { byte_idx: 15, utf16_idx: 12 }, + Line { byte_idx: 18, utf16_idx: 15 }, + ]); + } + + #[test] + fn test_source_file_pos_to_line() { + let source = Source::detached(TEST); + assert_eq!(source.byte_to_line(0), Some(0)); + assert_eq!(source.byte_to_line(2), Some(0)); + assert_eq!(source.byte_to_line(6), Some(0)); + assert_eq!(source.byte_to_line(7), Some(1)); + assert_eq!(source.byte_to_line(8), Some(1)); + assert_eq!(source.byte_to_line(12), Some(1)); + assert_eq!(source.byte_to_line(21), Some(3)); + assert_eq!(source.byte_to_line(22), None); + } + + #[test] + fn test_source_file_pos_to_column() { + let source = Source::detached(TEST); + assert_eq!(source.byte_to_column(0), Some(0)); + assert_eq!(source.byte_to_column(2), Some(1)); + assert_eq!(source.byte_to_column(6), Some(5)); + assert_eq!(source.byte_to_column(7), Some(0)); + assert_eq!(source.byte_to_column(8), Some(1)); + assert_eq!(source.byte_to_column(12), Some(2)); + } + + #[test] + fn test_source_file_utf16() { + #[track_caller] + fn roundtrip(source: &Source, byte_idx: usize, utf16_idx: usize) { + let middle = source.byte_to_utf16(byte_idx).unwrap(); + let result = source.utf16_to_byte(middle).unwrap(); + assert_eq!(middle, utf16_idx); + 
assert_eq!(result, byte_idx); + } + + let source = Source::detached(TEST); + roundtrip(&source, 0, 0); + roundtrip(&source, 2, 1); + roundtrip(&source, 3, 2); + roundtrip(&source, 8, 7); + roundtrip(&source, 12, 9); + roundtrip(&source, 21, 18); + assert_eq!(source.byte_to_utf16(22), None); + assert_eq!(source.utf16_to_byte(19), None); + } + + #[test] + fn test_source_file_roundtrip() { + #[track_caller] + fn roundtrip(source: &Source, byte_idx: usize) { + let line = source.byte_to_line(byte_idx).unwrap(); + let column = source.byte_to_column(byte_idx).unwrap(); + let result = source.line_column_to_byte(line, column).unwrap(); + assert_eq!(result, byte_idx); + } + + let source = Source::detached(TEST); + roundtrip(&source, 0); + roundtrip(&source, 7); + roundtrip(&source, 12); + roundtrip(&source, 21); + } + + #[test] + fn test_source_file_edit() { + #[track_caller] + fn test(prev: &str, range: Range<usize>, with: &str, after: &str) { + let mut source = Source::detached(prev); + let result = Source::detached(after); + source.edit(range, with); + assert_eq!(source.text, result.text); + assert_eq!(source.lines, result.lines); + assert_eq!(*source.root, *result.root); + } + + // Test inserting at the beginning. + test("abc\n", 0 .. 0, "hi\n", "hi\nabc\n"); + test("\nabc", 0 .. 0, "hi\r", "hi\r\nabc"); + + // Test editing in the middle. + test(TEST, 4 .. 16, "❌", "ä\tc❌i\rjkl"); + + // Test appending. + test("abc\ndef", 7 .. 7, "hi", "abc\ndefhi"); + test("abc\ndef\n", 8 .. 8, "hi", "abc\ndef\nhi"); + + // Test appending with adjoining \r and \n. + test("abc\ndef\r", 8 .. 8, "\nghi", "abc\ndef\r\nghi"); + + // Test removing everything. + test(TEST, 0 .. 
21, "", ""); + } +} diff --git a/src/syntax/span.rs b/src/syntax/span.rs index 744aa123..d4d9a8f6 100644 --- a/src/syntax/span.rs +++ b/src/syntax/span.rs @@ -2,7 +2,7 @@ use std::fmt::{self, Debug, Display, Formatter}; use std::num::NonZeroU64; use std::ops::Range; -use crate::syntax::SourceId; +use super::SourceId; /// A value with a span locating it in the source code. #[derive(Copy, Clone, Eq, PartialEq, Hash)] @@ -42,8 +42,8 @@ impl<T: Debug> Debug for Spanned<T> { /// A unique identifier for a syntax node. /// /// This is used throughout the compiler to track which source section an error -/// or element stems from. Can be [mapped back](crate::source::Source::range) -/// to a byte range for user facing display. +/// or element stems from. Can be [mapped back](super::Source::range) to a byte +/// range for user facing display. /// /// Span ids are ordered in the tree to enable quickly finding the node with /// some id: diff --git a/src/syntax/tokens.rs b/src/syntax/tokens.rs new file mode 100644 index 00000000..8e1b6944 --- /dev/null +++ b/src/syntax/tokens.rs @@ -0,0 +1,1176 @@ +use std::sync::Arc; + +use unicode_xid::UnicodeXID; +use unscanny::Scanner; + +use super::resolve::{resolve_hex, resolve_raw, resolve_string}; +use super::{ErrorPos, NodeKind, RawKind, Unit}; +use crate::geom::{AngleUnit, LengthUnit}; +use crate::util::EcoString; + +/// An iterator over the tokens of a string of source code. +#[derive(Clone)] +pub struct Tokens<'s> { + /// The underlying scanner. + s: Scanner<'s>, + /// The mode the scanner is in. This determines what tokens it recognizes. + mode: TokenMode, + /// Whether the last token has been terminated. + terminated: bool, + /// Offsets the indentation on the first line of the source. + column_offset: usize, +} + +/// What kind of tokens to emit. +#[derive(Debug, Copy, Clone, Eq, PartialEq)] +pub enum TokenMode { + /// Text and markup. + Markup, + /// Math atoms, operators, etc. + Math, + /// Keywords, literals and operators. 
+ Code, +} + +impl<'s> Tokens<'s> { + /// Create a new token iterator with the given mode. + #[inline] + pub fn new(text: &'s str, mode: TokenMode) -> Self { + Self::with_prefix("", text, mode) + } + + /// Create a new token iterator with the given mode and a prefix to offset + /// column calculations. + #[inline] + pub fn with_prefix(prefix: &str, text: &'s str, mode: TokenMode) -> Self { + Self { + s: Scanner::new(text), + mode, + terminated: true, + column_offset: column(prefix, prefix.len(), 0), + } + } + + /// Get the current token mode. + #[inline] + pub fn mode(&self) -> TokenMode { + self.mode + } + + /// Change the token mode. + #[inline] + pub fn set_mode(&mut self, mode: TokenMode) { + self.mode = mode; + } + + /// The index in the string at which the last token ends and next token + /// will start. + #[inline] + pub fn cursor(&self) -> usize { + self.s.cursor() + } + + /// Jump to the given index in the string. + #[inline] + pub fn jump(&mut self, index: usize) { + self.s.jump(index); + } + + /// The underlying scanner. + #[inline] + pub fn scanner(&self) -> Scanner<'s> { + self.s + } + + /// Whether the last token was terminated. + #[inline] + pub fn terminated(&self) -> bool { + self.terminated + } + + /// The column index of a given index in the source string. + #[inline] + pub fn column(&self, index: usize) -> usize { + column(self.s.string(), index, self.column_offset) + } +} + +impl<'s> Iterator for Tokens<'s> { + type Item = NodeKind; + + /// Parse the next token in the source code. + #[inline] + fn next(&mut self) -> Option<Self::Item> { + let start = self.s.cursor(); + let c = self.s.eat()?; + Some(match c { + // Trivia. + '/' if self.s.eat_if('/') => self.line_comment(), + '/' if self.s.eat_if('*') => self.block_comment(), + '*' if self.s.eat_if('/') => { + NodeKind::Error(ErrorPos::Full, "unexpected end of block comment".into()) + } + c if c.is_whitespace() => self.whitespace(c), + + // Other things. 
+ _ => match self.mode { + TokenMode::Markup => self.markup(start, c), + TokenMode::Math => self.math(start, c), + TokenMode::Code => self.code(start, c), + }, + }) + } +} + +impl<'s> Tokens<'s> { + fn line_comment(&mut self) -> NodeKind { + self.s.eat_until(is_newline); + if self.s.peek().is_none() { + self.terminated = false; + } + NodeKind::LineComment + } + + fn block_comment(&mut self) -> NodeKind { + let mut state = '_'; + let mut depth = 1; + self.terminated = false; + + // Find the first `*/` that does not correspond to a nested `/*`. + while let Some(c) = self.s.eat() { + state = match (state, c) { + ('*', '/') => { + depth -= 1; + if depth == 0 { + self.terminated = true; + break; + } + '_' + } + ('/', '*') => { + depth += 1; + '_' + } + ('/', '/') => { + self.line_comment(); + '_' + } + _ => c, + } + } + + NodeKind::BlockComment + } + + fn whitespace(&mut self, c: char) -> NodeKind { + if c == ' ' && !self.s.at(char::is_whitespace) { + return NodeKind::Space { newlines: 0 }; + } + + self.s.uneat(); + + // Count the number of newlines. + let mut newlines = 0; + while let Some(c) = self.s.eat() { + if !c.is_whitespace() { + self.s.uneat(); + break; + } + + if is_newline(c) { + if c == '\r' { + self.s.eat_if('\n'); + } + newlines += 1; + } + } + + NodeKind::Space { newlines } + } + + #[inline] + fn markup(&mut self, start: usize, c: char) -> NodeKind { + match c { + // Blocks. + '{' => NodeKind::LeftBrace, + '}' => NodeKind::RightBrace, + '[' => NodeKind::LeftBracket, + ']' => NodeKind::RightBracket, + + // Multi-char things. + '#' => self.hash(start), + '.' if self.s.eat_if("..") => NodeKind::Shorthand('\u{2026}'), + '-' => self.hyph(), + 'h' if self.s.eat_if("ttp://") || self.s.eat_if("ttps://") => { + self.link(start) + } + '`' => self.raw(), + c if c.is_ascii_digit() => self.numbering(start), + '<' => self.label(), + '@' => self.reference(start), + + // Escape sequences. + '\\' => self.backslash(), + + // Single-char things. 
+ '~' => NodeKind::Shorthand('\u{00A0}'), + '\'' => NodeKind::SmartQuote { double: false }, + '"' => NodeKind::SmartQuote { double: true }, + '*' if !self.in_word() => NodeKind::Star, + '_' if !self.in_word() => NodeKind::Underscore, + '$' => NodeKind::Dollar, + '=' => NodeKind::Eq, + '+' => NodeKind::Plus, + '/' => NodeKind::Slash, + ':' => NodeKind::Colon, + + // Plain text. + _ => self.text(start), + } + } + + #[inline] + fn text(&mut self, start: usize) -> NodeKind { + macro_rules! table { + ($(|$c:literal)*) => {{ + let mut t = [false; 128]; + $(t[$c as usize] = true;)* + t + }} + } + + const TABLE: [bool; 128] = table! { + | ' ' | '\t' | '\n' | '\x0b' | '\x0c' | '\r' | '\\' | '/' + | '[' | ']' | '{' | '}' | '~' | '-' | '.' | '\'' | '"' + | '*' | '_' | ':' | 'h' | '`' | '$' | '<' | '>' | '@' | '#' + }; + + loop { + self.s.eat_until(|c: char| { + TABLE.get(c as usize).copied().unwrap_or_else(|| c.is_whitespace()) + }); + + // Continue with the same text node if the thing would become text + // anyway. + let mut s = self.s; + match s.eat() { + Some('/') if !s.at(['/', '*']) => {} + Some(' ') if s.at(char::is_alphanumeric) => {} + Some('-') if !s.at(['-', '?']) => {} + Some('.') if !s.at("..") => {} + Some('h') if !s.at("ttp://") && !s.at("ttps://") => {} + Some('@' | '#') if !s.at(is_id_start) => {} + _ => break, + } + + self.s = s; + } + + NodeKind::Text(self.s.from(start).into()) + } + + fn backslash(&mut self) -> NodeKind { + match self.s.peek() { + Some('u') if self.s.eat_if("u{") => { + let sequence = self.s.eat_while(char::is_ascii_alphanumeric); + if self.s.eat_if('}') { + if let Some(c) = resolve_hex(sequence) { + NodeKind::Escape(c) + } else { + NodeKind::Error( + ErrorPos::Full, + "invalid unicode escape sequence".into(), + ) + } + } else { + self.terminated = false; + NodeKind::Error(ErrorPos::End, "expected closing brace".into()) + } + } + + // Linebreaks. 
+ Some(c) if c.is_whitespace() => NodeKind::Linebreak, + None => NodeKind::Linebreak, + + // Escapes. + Some(c) => { + self.s.expect(c); + NodeKind::Escape(c) + } + } + } + + fn hash(&mut self, start: usize) -> NodeKind { + if self.s.at(is_id_start) { + let read = self.s.eat_while(is_id_continue); + match keyword(read) { + Some(keyword) => keyword, + None => NodeKind::Ident(read.into()), + } + } else { + self.text(start) + } + } + + fn hyph(&mut self) -> NodeKind { + if self.s.eat_if('-') { + if self.s.eat_if('-') { + NodeKind::Shorthand('\u{2014}') + } else { + NodeKind::Shorthand('\u{2013}') + } + } else if self.s.eat_if('?') { + NodeKind::Shorthand('\u{00AD}') + } else { + NodeKind::Minus + } + } + + fn link(&mut self, start: usize) -> NodeKind { + #[rustfmt::skip] + self.s.eat_while(|c: char| matches!(c, + | '0' ..= '9' + | 'a' ..= 'z' + | 'A' ..= 'Z' + | '~' | '/' | '%' | '?' | '#' | '&' | '+' | '=' + | '\'' | '.' | ',' | ';' + )); + if self.s.scout(-1) == Some('.') { + self.s.uneat(); + } + NodeKind::Link(self.s.from(start).into()) + } + + fn raw(&mut self) -> NodeKind { + let column = self.column(self.s.cursor() - 1); + + let mut backticks = 1; + while self.s.eat_if('`') { + backticks += 1; + } + + // Special case for empty inline block. + if backticks == 2 { + return NodeKind::Raw(Arc::new(RawKind { + text: EcoString::new(), + lang: None, + block: false, + })); + } + + let start = self.s.cursor(); + let mut found = 0; + while found < backticks { + match self.s.eat() { + Some('`') => found += 1, + Some(_) => found = 0, + None => break, + } + } + + if found == backticks { + let end = self.s.cursor() - found as usize; + NodeKind::Raw(Arc::new(resolve_raw( + column, + backticks, + self.s.get(start .. 
end), + ))) + } else { + self.terminated = false; + let remaining = backticks - found; + let noun = if remaining == 1 { "backtick" } else { "backticks" }; + NodeKind::Error( + ErrorPos::End, + if found == 0 { + format_eco!("expected {} {}", remaining, noun) + } else { + format_eco!("expected {} more {}", remaining, noun) + }, + ) + } + } + + fn numbering(&mut self, start: usize) -> NodeKind { + self.s.eat_while(char::is_ascii_digit); + let read = self.s.from(start); + if self.s.eat_if('.') { + if let Ok(number) = read.parse() { + return NodeKind::EnumNumbering(number); + } + } + + self.text(start) + } + + fn label(&mut self) -> NodeKind { + let label = self.s.eat_while(is_id_continue); + if self.s.eat_if('>') { + if !label.is_empty() { + NodeKind::Label(label.into()) + } else { + NodeKind::Error(ErrorPos::Full, "label cannot be empty".into()) + } + } else { + self.terminated = false; + NodeKind::Error(ErrorPos::End, "expected closing angle bracket".into()) + } + } + + fn reference(&mut self, start: usize) -> NodeKind { + let label = self.s.eat_while(is_id_continue); + if !label.is_empty() { + NodeKind::Ref(label.into()) + } else { + self.text(start) + } + } + + fn math(&mut self, start: usize, c: char) -> NodeKind { + match c { + // Escape sequences. + '\\' => self.backslash(), + + // Single-char things. + '_' => NodeKind::Underscore, + '^' => NodeKind::Hat, + '/' => NodeKind::Slash, + '&' => NodeKind::Amp, + '$' => NodeKind::Dollar, + + // Brackets. + '{' => NodeKind::LeftBrace, + '}' => NodeKind::RightBrace, + '[' => NodeKind::LeftBracket, + ']' => NodeKind::RightBracket, + '(' => NodeKind::LeftParen, + ')' => NodeKind::RightParen, + + // Identifiers. + c if is_math_id_start(c) && self.s.at(is_math_id_continue) => { + self.s.eat_while(is_math_id_continue); + NodeKind::Ident(self.s.from(start).into()) + } + + // Numbers. + c if c.is_numeric() => { + self.s.eat_while(char::is_numeric); + NodeKind::Atom(self.s.from(start).into()) + } + + // Other math atoms. 
+ c => NodeKind::Atom(c.into()), + } + } + + fn code(&mut self, start: usize, c: char) -> NodeKind { + match c { + // Blocks. + '{' => NodeKind::LeftBrace, + '}' => NodeKind::RightBrace, + '[' => NodeKind::LeftBracket, + ']' => NodeKind::RightBracket, + + // Parentheses. + '(' => NodeKind::LeftParen, + ')' => NodeKind::RightParen, + + // Two-char operators. + '=' if self.s.eat_if('=') => NodeKind::EqEq, + '!' if self.s.eat_if('=') => NodeKind::ExclEq, + '<' if self.s.eat_if('=') => NodeKind::LtEq, + '>' if self.s.eat_if('=') => NodeKind::GtEq, + '+' if self.s.eat_if('=') => NodeKind::PlusEq, + '-' if self.s.eat_if('=') => NodeKind::HyphEq, + '*' if self.s.eat_if('=') => NodeKind::StarEq, + '/' if self.s.eat_if('=') => NodeKind::SlashEq, + '.' if self.s.eat_if('.') => NodeKind::Dots, + '=' if self.s.eat_if('>') => NodeKind::Arrow, + + // Single-char operators. + ',' => NodeKind::Comma, + ';' => NodeKind::Semicolon, + ':' => NodeKind::Colon, + '+' => NodeKind::Plus, + '-' => NodeKind::Minus, + '*' => NodeKind::Star, + '/' => NodeKind::Slash, + '=' => NodeKind::Eq, + '<' => NodeKind::Lt, + '>' => NodeKind::Gt, + '.' if !self.s.at(char::is_ascii_digit) => NodeKind::Dot, + + // Identifiers. + c if is_id_start(c) => self.ident(start), + + // Numbers. + c if c.is_ascii_digit() || (c == '.' && self.s.at(char::is_ascii_digit)) => { + self.number(start, c) + } + + // Strings. + '"' => self.string(), + + // Invalid token. + _ => NodeKind::Error(ErrorPos::Full, "not valid here".into()), + } + } + + fn ident(&mut self, start: usize) -> NodeKind { + self.s.eat_while(is_id_continue); + match self.s.from(start) { + "none" => NodeKind::None, + "auto" => NodeKind::Auto, + "true" => NodeKind::Bool(true), + "false" => NodeKind::Bool(false), + id => keyword(id).unwrap_or_else(|| NodeKind::Ident(id.into())), + } + } + + fn number(&mut self, start: usize, c: char) -> NodeKind { + // Read the first part (integer or fractional depending on `c`). 
+ self.s.eat_while(char::is_ascii_digit); + + // Read the fractional part if not already done. + // Make sure not to confuse a range for the decimal separator. + if c != '.' && !self.s.at("..") && self.s.eat_if('.') { + self.s.eat_while(char::is_ascii_digit); + } + + // Read the exponent. + if !self.s.at("em") && self.s.eat_if(['e', 'E']) { + self.s.eat_if(['+', '-']); + self.s.eat_while(char::is_ascii_digit); + } + + // Read the suffix. + let suffix_start = self.s.cursor(); + if !self.s.eat_if('%') { + self.s.eat_while(char::is_ascii_alphanumeric); + } + + let number = self.s.get(start .. suffix_start); + let suffix = self.s.from(suffix_start); + + // Find out whether it is a simple number. + if suffix.is_empty() { + if let Ok(i) = number.parse::<i64>() { + return NodeKind::Int(i); + } + } + + let v = match number.parse::<f64>() { + Ok(v) => v, + Err(_) => return NodeKind::Error(ErrorPos::Full, "invalid number".into()), + }; + + match suffix { + "" => NodeKind::Float(v), + "pt" => NodeKind::Numeric(v, Unit::Length(LengthUnit::Pt)), + "mm" => NodeKind::Numeric(v, Unit::Length(LengthUnit::Mm)), + "cm" => NodeKind::Numeric(v, Unit::Length(LengthUnit::Cm)), + "in" => NodeKind::Numeric(v, Unit::Length(LengthUnit::In)), + "deg" => NodeKind::Numeric(v, Unit::Angle(AngleUnit::Deg)), + "rad" => NodeKind::Numeric(v, Unit::Angle(AngleUnit::Rad)), + "em" => NodeKind::Numeric(v, Unit::Em), + "fr" => NodeKind::Numeric(v, Unit::Fr), + "%" => NodeKind::Numeric(v, Unit::Percent), + _ => NodeKind::Error(ErrorPos::Full, "invalid number suffix".into()), + } + } + + fn string(&mut self) -> NodeKind { + let mut escaped = false; + let verbatim = self.s.eat_until(|c| { + if c == '"' && !escaped { + true + } else { + escaped = c == '\\' && !escaped; + false + } + }); + + let string = resolve_string(verbatim); + if self.s.eat_if('"') { + NodeKind::Str(string) + } else { + self.terminated = false; + NodeKind::Error(ErrorPos::End, "expected quote".into()) + } + } + + fn in_word(&self) -> 
bool { + let alphanumeric = |c: Option<char>| c.map_or(false, |c| c.is_alphanumeric()); + let prev = self.s.scout(-2); + let next = self.s.peek(); + alphanumeric(prev) && alphanumeric(next) + } +} + +fn keyword(ident: &str) -> Option<NodeKind> { + Some(match ident { + "not" => NodeKind::Not, + "and" => NodeKind::And, + "or" => NodeKind::Or, + "let" => NodeKind::Let, + "set" => NodeKind::Set, + "show" => NodeKind::Show, + "wrap" => NodeKind::Wrap, + "if" => NodeKind::If, + "else" => NodeKind::Else, + "for" => NodeKind::For, + "in" => NodeKind::In, + "as" => NodeKind::As, + "while" => NodeKind::While, + "break" => NodeKind::Break, + "continue" => NodeKind::Continue, + "return" => NodeKind::Return, + "import" => NodeKind::Import, + "include" => NodeKind::Include, + "from" => NodeKind::From, + _ => return None, + }) +} + +/// The column index of a given index in the source string, given a column +/// offset for the first line. +#[inline] +fn column(string: &str, index: usize, offset: usize) -> usize { + let mut apply_offset = false; + let res = string[.. index] + .char_indices() + .rev() + .take_while(|&(_, c)| !is_newline(c)) + .inspect(|&(i, _)| { + if i == 0 { + apply_offset = true + } + }) + .count(); + + // The loop is never executed if the slice is empty, but we are of + // course still at the start of the first line. + if index == 0 { + apply_offset = true; + } + + if apply_offset { res + offset } else { res } +} + +/// Whether this character denotes a newline. +#[inline] +pub fn is_newline(character: char) -> bool { + matches!( + character, + // Line Feed, Vertical Tab, Form Feed, Carriage Return. + '\n' | '\x0B' | '\x0C' | '\r' | + // Next Line, Line Separator, Paragraph Separator. + '\u{0085}' | '\u{2028}' | '\u{2029}' + ) +} + +/// Whether a string is a valid unicode identifier. +/// +/// In addition to what is specified in the [Unicode Standard][uax31], we allow: +/// - `_` as a starting character, +/// - `_` and `-` as continuing characters. 
///
/// [uax31]: http://www.unicode.org/reports/tr31/
#[inline]
pub fn is_ident(string: &str) -> bool {
    let mut chars = string.chars();
    chars
        .next()
        .map_or(false, |c| is_id_start(c) && chars.all(is_id_continue))
}

/// Whether a character can start an identifier.
#[inline]
fn is_id_start(c: char) -> bool {
    c.is_xid_start() || c == '_'
}

/// Whether a character can continue an identifier.
#[inline]
fn is_id_continue(c: char) -> bool {
    c.is_xid_continue() || c == '_' || c == '-'
}

/// Whether a character can start an identifier in math.
#[inline]
fn is_math_id_start(c: char) -> bool {
    c.is_xid_start()
}

/// Whether a character can continue an identifier in math.
/// Unlike in code, underscores are not part of math identifiers.
#[inline]
fn is_math_id_continue(c: char) -> bool {
    c.is_xid_continue() && c != '_'
}

#[cfg(test)]
#[allow(non_snake_case)]
mod tests {
    use super::super::tests::check;
    use super::*;

    use ErrorPos::*;
    use NodeKind::*;
    use Option::None;
    use TokenMode::{Code, Markup};

    // Shorthand constructors so that expected token lists below read
    // naturally, e.g. `Space(1)` or `Text("hi")`.
    fn Space(newlines: usize) -> NodeKind {
        NodeKind::Space { newlines }
    }

    fn Raw(text: &str, lang: Option<&str>, block: bool) -> NodeKind {
        NodeKind::Raw(Arc::new(RawKind {
            text: text.into(),
            lang: lang.map(Into::into),
            block,
        }))
    }

    fn Str(string: &str) -> NodeKind {
        NodeKind::Str(string.into())
    }

    fn Text(string: &str) -> NodeKind {
        NodeKind::Text(string.into())
    }

    fn Ident(ident: &str) -> NodeKind {
        NodeKind::Ident(ident.into())
    }

    fn Error(pos: ErrorPos, message: &str) -> NodeKind {
        NodeKind::Error(pos, message.into())
    }

    /// Building blocks for suffix testing.
    ///
    /// We extend each test case with a collection of different suffixes to make
    /// sure tokens end at the correct position. These suffixes are split into
    /// blocks, which can be disabled/enabled per test case. For example, when
    /// testing identifiers we disable letter suffixes because these would
    /// mingle with the identifiers.
    ///
    /// Suffix blocks:
    /// - ' ': spacing
    /// - 'a': letters
    /// - '1': numbers
    /// - '/': symbols
    const BLOCKS: &str = " a1/";

    // Suffixes described by four-tuples of:
    //
    // - block the suffix is part of
    // - mode in which the suffix is applicable
    // - the suffix string
    // - the resulting suffix NodeKind
    fn suffixes()
    -> impl Iterator<Item = (char, Option<TokenMode>, &'static str, NodeKind)> {
        [
            // Whitespace suffixes.
            (' ', None, " ", Space(0)),
            (' ', None, "\n", Space(1)),
            (' ', None, "\r", Space(1)),
            (' ', None, "\r\n", Space(1)),
            // Letter suffixes.
            ('a', Some(Markup), "hello", Text("hello")),
            ('a', Some(Markup), "💚", Text("💚")),
            ('a', Some(Code), "val", Ident("val")),
            ('a', Some(Code), "α", Ident("α")),
            ('a', Some(Code), "_", Ident("_")),
            // Number suffixes.
            ('1', Some(Code), "2", Int(2)),
            ('1', Some(Code), ".2", Float(0.2)),
            // Symbol suffixes.
            ('/', None, "[", LeftBracket),
            ('/', None, "//", LineComment),
            ('/', None, "/**/", BlockComment),
            ('/', Some(Markup), "*", Star),
            ('/', Some(Markup), r"\\", Escape('\\')),
            ('/', Some(Markup), "#let", Let),
            ('/', Some(Code), "(", LeftParen),
            ('/', Some(Code), ":", Colon),
            ('/', Some(Code), "+=", PlusEq),
        ]
        .into_iter()
    }

    // Test macro. `t!(Mode["blocks"]: input => tokens...)` tokenizes `input`
    // in the given mode (or both modes for `Both`) and checks the result,
    // additionally re-running the case with every applicable suffix from the
    // enabled blocks appended.
    macro_rules! t {
        (Both $($tts:tt)*) => {
            t!(Markup $($tts)*);
            t!(Code $($tts)*);
        };
        ($mode:ident $([$blocks:literal])?: $text:expr => $($token:expr),*) => {{
            // Test without suffix.
            t!(@$mode: $text => $($token),*);

            // Test with each applicable suffix.
            for (block, mode, suffix, ref token) in suffixes() {
                let text = $text;
                #[allow(unused_variables)]
                let blocks = BLOCKS;
                $(let blocks = $blocks;)?
                // Sanity check: a case may only enable blocks that exist.
                assert!(!blocks.contains(|c| !BLOCKS.contains(c)));
                if (mode.is_none() || mode == Some($mode)) && blocks.contains(block) {
                    t!(@$mode: format!("{}{}", text, suffix) => $($token,)* token);
                }
            }
        }};
        (@$mode:ident: $text:expr => $($token:expr),*) => {{
            let text = $text;
            let found = Tokens::new(&text, $mode).collect::<Vec<_>>();
            let expected = vec![$($token.clone()),*];
            check(&text, found, expected);
        }};
    }

    #[test]
    fn test_tokenize_brackets() {
        // Test in markup.
        t!(Markup: "{" => LeftBrace);
        t!(Markup: "}" => RightBrace);
        t!(Markup: "[" => LeftBracket);
        t!(Markup: "]" => RightBracket);
        // Parentheses are plain text in markup.
        t!(Markup[" /"]: "(" => Text("("));
        t!(Markup[" /"]: ")" => Text(")"));

        // Test in code.
        t!(Code: "{" => LeftBrace);
        t!(Code: "}" => RightBrace);
        t!(Code: "[" => LeftBracket);
        t!(Code: "]" => RightBracket);
        t!(Code: "(" => LeftParen);
        t!(Code: ")" => RightParen);
    }

    #[test]
    fn test_tokenize_whitespace() {
        // Test basic whitespace.
        t!(Both["a1/"]: "" => );
        t!(Both["a1/"]: " " => Space(0));
        t!(Both["a1/"]: "  " => Space(0));
        t!(Both["a1/"]: "\t" => Space(0));
        t!(Both["a1/"]: "  \t" => Space(0));
        t!(Both["a1/"]: "\u{202F}" => Space(0));

        // Test newline counting.
        t!(Both["a1/"]: "\n" => Space(1));
        t!(Both["a1/"]: "\n " => Space(1));
        t!(Both["a1/"]: "  \n" => Space(1));
        t!(Both["a1/"]: "  \n   " => Space(1));
        t!(Both["a1/"]: "\r\n" => Space(1));
        t!(Both["a1/"]: "\r\n\r" => Space(2));
        t!(Both["a1/"]: "  \n\t \n  " => Space(2));
        t!(Both["a1/"]: "\n\r" => Space(2));
        t!(Both["a1/"]: " \r\r\n \x0D" => Space(3));
    }

    #[test]
    fn test_tokenize_text() {
        // Test basic text.
        t!(Markup[" /"]: "hello" => Text("hello"));
        t!(Markup[" /"]: "reha-world" => Text("reha-world"));

        // Test code symbols in text.
        t!(Markup[" /"]: "a():\"b" => Text("a()"), Colon, SmartQuote { double: true }, Text("b"));
        t!(Markup[" /"]: ";,|/+" => Text(";,|/+"));
        t!(Markup[" /"]: "=-a" => Eq, Minus, Text("a"));
        t!(Markup[" "]: "#123" => Text("#123"));

        // Test text ends.
        t!(Markup[""]: "hello " => Text("hello"), Space(0));
        t!(Markup[""]: "hello~" => Text("hello"), Shorthand('\u{00A0}'));
    }

    #[test]
    fn test_tokenize_escape_sequences() {
        // Test escapable symbols.
        t!(Markup: r"\\" => Escape('\\'));
        t!(Markup: r"\/" => Escape('/'));
        t!(Markup: r"\[" => Escape('['));
        t!(Markup: r"\]" => Escape(']'));
        t!(Markup: r"\{" => Escape('{'));
        t!(Markup: r"\}" => Escape('}'));
        t!(Markup: r"\*" => Escape('*'));
        t!(Markup: r"\_" => Escape('_'));
        t!(Markup: r"\=" => Escape('='));
        t!(Markup: r"\~" => Escape('~'));
        t!(Markup: r"\'" => Escape('\''));
        t!(Markup: r#"\""# => Escape('"'));
        t!(Markup: r"\`" => Escape('`'));
        t!(Markup: r"\$" => Escape('$'));
        t!(Markup: r"\#" => Escape('#'));
        t!(Markup: r"\a" => Escape('a'));
        t!(Markup: r"\u" => Escape('u'));
        t!(Markup: r"\1" => Escape('1'));

        // Test basic unicode escapes.
        t!(Markup: r"\u{}" => Error(Full, "invalid unicode escape sequence"));
        t!(Markup: r"\u{2603}" => Escape('☃'));
        t!(Markup: r"\u{P}" => Error(Full, "invalid unicode escape sequence"));

        // Test unclosed unicode escapes.
        t!(Markup[" /"]: r"\u{" => Error(End, "expected closing brace"));
        t!(Markup[" /"]: r"\u{1" => Error(End, "expected closing brace"));
        t!(Markup[" /"]: r"\u{26A4" => Error(End, "expected closing brace"));
        t!(Markup[" /"]: r"\u{1Q3P" => Error(End, "expected closing brace"));
        t!(Markup: r"\u{1🏕}" => Error(End, "expected closing brace"), Text("🏕"), RightBrace);
    }

    #[test]
    fn test_tokenize_markup_symbols() {
        // Test markup tokens.
        t!(Markup[" a1"]: "*" => Star);
        t!(Markup: "_" => Underscore);
        t!(Markup[""]: "===" => Eq, Eq, Eq);
        t!(Markup["a1/"]: "= " => Eq, Space(0));
        t!(Markup[" "]: r"\" => Linebreak);
        // Shorthands: `~` is a non-breaking space, `-?` a soft hyphen,
        // `--` an en dash.
        t!(Markup: "~" => Shorthand('\u{00A0}'));
        t!(Markup["a1/"]: "-?" => Shorthand('\u{00AD}'));
        t!(Markup["a "]: r"a--" => Text("a"), Shorthand('\u{2013}'));
        t!(Markup["a1/"]: "- " => Minus, Space(0));
        t!(Markup[" "]: "+" => Plus);
        t!(Markup[" "]: "1." => EnumNumbering(1));
        t!(Markup[" "]: "1.a" => EnumNumbering(1), Text("a"));
        t!(Markup[" /"]: "a1." => Text("a1."));
    }

    #[test]
    fn test_tokenize_code_symbols() {
        // Test all symbols.
        t!(Code: "," => Comma);
        t!(Code: ";" => Semicolon);
        t!(Code: ":" => Colon);
        t!(Code: "+" => Plus);
        t!(Code: "-" => Minus);
        t!(Code[" a1"]: "*" => Star);
        t!(Code[" a1"]: "/" => Slash);
        t!(Code[" a/"]: "." => Dot);
        t!(Code: "=" => Eq);
        t!(Code: "==" => EqEq);
        t!(Code: "!=" => ExclEq);
        t!(Code: "<" => Lt);
        t!(Code: "<=" => LtEq);
        t!(Code: ">" => Gt);
        t!(Code: ">=" => GtEq);
        t!(Code: "+=" => PlusEq);
        t!(Code: "-=" => HyphEq);
        t!(Code: "*=" => StarEq);
        t!(Code: "/=" => SlashEq);
        t!(Code: ".." => Dots);
        t!(Code: "=>" => Arrow);

        // Test combinations.
        t!(Code: "<=>" => LtEq, Gt);
        t!(Code[" a/"]: "..." => Dots, Dot);

        // Test hyphen as symbol vs part of identifier.
        t!(Code[" /"]: "-1" => Minus, Int(1));
        t!(Code[" /"]: "-a" => Minus, Ident("a"));
        t!(Code[" /"]: "--1" => Minus, Minus, Int(1));
        t!(Code[" /"]: "--_a" => Minus, Minus, Ident("_a"));
        t!(Code[" /"]: "a-b" => Ident("a-b"));

        // Test invalid.
        t!(Code: r"\" => Error(Full, "not valid here"));
    }

    #[test]
    fn test_tokenize_keywords() {
        // A list of a few (not all) keywords.
        let list = [
            ("not", Not),
            ("let", Let),
            ("if", If),
            ("else", Else),
            ("for", For),
            ("in", In),
            ("import", Import),
        ];

        // In markup, keywords are only recognized after a hashtag.
        for (s, t) in list.clone() {
            t!(Markup[" "]: format!("#{}", s) => t);
            t!(Markup[" "]: format!("#{0}#{0}", s) => t, t);
            t!(Markup[" /"]: format!("# {}", s) => Text(&format!("# {s}")));
        }

        for (s, t) in list {
            t!(Code[" "]: s => t);
            t!(Markup[" /"]: s => Text(s));
        }

        // Test simple identifier.
        t!(Markup[" "]: "#letter" => Ident("letter"));
        t!(Code[" /"]: "falser" => Ident("falser"));
        t!(Code[" /"]: "None" => Ident("None"));
        t!(Code[" /"]: "True" => Ident("True"));
    }

    #[test]
    fn test_tokenize_raw_blocks() {
        // Test basic raw block.
        t!(Markup: "``" => Raw("", None, false));
        t!(Markup: "`raw`" => Raw("raw", None, false));
        t!(Markup[""]: "`]" => Error(End, "expected 1 backtick"));

        // Test special symbols in raw block.
        t!(Markup: "`[brackets]`" => Raw("[brackets]", None, false));
        t!(Markup[""]: r"`\`` " => Raw(r"\", None, false), Error(End, "expected 1 backtick"));

        // Test separated closing backticks.
        t!(Markup: "```not `y`e`t```" => Raw("`y`e`t", Some("not"), false));

        // Test more backticks.
        t!(Markup: "``nope``" => Raw("", None, false), Text("nope"), Raw("", None, false));
        t!(Markup: "````🚀````" => Raw("", None, false));
        t!(Markup[""]: "`````👩🚀````noend" => Error(End, "expected 5 backticks"));
        t!(Markup[""]: "````raw``````" => Raw("", Some("raw"), false), Raw("", None, false));
    }

    #[test]
    fn test_tokenize_idents() {
        // Test valid identifiers.
        t!(Code[" /"]: "x" => Ident("x"));
        t!(Code[" /"]: "value" => Ident("value"));
        t!(Code[" /"]: "__main__" => Ident("__main__"));
        t!(Code[" /"]: "_snake_case" => Ident("_snake_case"));

        // Test non-ascii.
        t!(Code[" /"]: "α" => Ident("α"));
        t!(Code[" /"]: "ម្តាយ" => Ident("ម្តាយ"));

        // Test hyphen parsed as identifier.
        t!(Code[" /"]: "kebab-case" => Ident("kebab-case"));
        t!(Code[" /"]: "one-10" => Ident("one-10"));
    }

    #[test]
    fn test_tokenize_numeric() {
        let ints = [("7", 7), ("012", 12)];
        let floats = [
            (".3", 0.3),
            ("0.3", 0.3),
            ("3.", 3.0),
            ("3.0", 3.0),
            ("14.3", 14.3),
            ("10e2", 1000.0),
            ("10e+0", 10.0),
            ("10e+1", 100.0),
            ("10e-2", 0.1),
            ("10.e1", 100.0),
            ("10.e-1", 1.0),
            (".1e1", 1.0),
            ("10E2", 1000.0),
        ];

        // Test integers.
        for &(s, v) in &ints {
            t!(Code[" /"]: s => Int(v));
        }

        // Test floats.
        for &(s, v) in &floats {
            t!(Code[" /"]: s => Float(v));
        }

        // Test attached numbers.
        t!(Code[" /"]: ".2.3" => Float(0.2), Float(0.3));
        t!(Code[" /"]: "1.2.3" => Float(1.2), Float(0.3));
        t!(Code[" /"]: "1e-2+3" => Float(0.01), Plus, Int(3));

        // Test float from too large integer.
        let large = i64::MAX as f64 + 1.0;
        t!(Code[" /"]: large.to_string() => Float(large));

        // Combined integers and floats.
        let nums = ints.iter().map(|&(k, v)| (k, v as f64)).chain(floats);

        // Every unit suffix paired with a constructor for the expected token.
        let suffixes: &[(&str, fn(f64) -> NodeKind)] = &[
            ("mm", |x| Numeric(x, Unit::Length(LengthUnit::Mm))),
            ("pt", |x| Numeric(x, Unit::Length(LengthUnit::Pt))),
            ("cm", |x| Numeric(x, Unit::Length(LengthUnit::Cm))),
            ("in", |x| Numeric(x, Unit::Length(LengthUnit::In))),
            ("rad", |x| Numeric(x, Unit::Angle(AngleUnit::Rad))),
            ("deg", |x| Numeric(x, Unit::Angle(AngleUnit::Deg))),
            ("em", |x| Numeric(x, Unit::Em)),
            ("fr", |x| Numeric(x, Unit::Fr)),
            ("%", |x| Numeric(x, Unit::Percent)),
        ];

        // Numeric types.
        for &(suffix, build) in suffixes {
            for (s, v) in nums.clone() {
                t!(Code[" /"]: format!("{}{}", s, suffix) => build(v));
            }
        }

        // Multiple dots close the number.
        t!(Code[" /"]: "1..2" => Int(1), Dots, Int(2));
        t!(Code[" /"]: "1..2.3" => Int(1), Dots, Float(2.3));
        t!(Code[" /"]: "1.2..3" => Float(1.2), Dots, Int(3));

        // Test invalid.
        t!(Code[" /"]: "1foo" => Error(Full, "invalid number suffix"));
    }

    #[test]
    fn test_tokenize_strings() {
        // Test basic strings.
        t!(Code: "\"hi\"" => Str("hi"));
        t!(Code: "\"hi\nthere\"" => Str("hi\nthere"));
        t!(Code: "\"🌎\"" => Str("🌎"));

        // Test unterminated.
        t!(Code[""]: "\"hi" => Error(End, "expected quote"));

        // Test escaped quote.
        t!(Code: r#""a\"bc""# => Str("a\"bc"));
        t!(Code[""]: r#""\""# => Error(End, "expected quote"));
    }

    #[test]
    fn test_tokenize_line_comments() {
        // Test line comment with no trailing newline.
        t!(Both[""]: "//" => LineComment);

        // Test line comment ends at newline.
        t!(Both["a1/"]: "//bc\n" => LineComment, Space(1));
        t!(Both["a1/"]: "// bc \n" => LineComment, Space(1));
        t!(Both["a1/"]: "//bc\r\n" => LineComment, Space(1));

        // Test nested line comments.
        t!(Both["a1/"]: "//a//b\n" => LineComment, Space(1));
    }

    #[test]
    fn test_tokenize_block_comments() {
        // Test basic block comments.
        t!(Both[""]: "/*" => BlockComment);
        t!(Both: "/**/" => BlockComment);
        t!(Both: "/*🏞*/" => BlockComment);
        t!(Both: "/*\n*/" => BlockComment);

        // Test depth 1 and 2 nested block comments.
        t!(Both: "/* /* */ */" => BlockComment);
        t!(Both: "/*/*/**/*/*/" => BlockComment);

        // Test two nested, one unclosed block comments.
        t!(Both[""]: "/*/*/**/*/" => BlockComment);

        // Test all combinations of up to two following slashes and stars.
        t!(Both[""]: "/*" => BlockComment);
        t!(Both[""]: "/*/" => BlockComment);
        t!(Both[""]: "/**" => BlockComment);
        t!(Both[""]: "/*//" => BlockComment);
        t!(Both[""]: "/*/*" => BlockComment);
        t!(Both[""]: "/**/" => BlockComment);
        t!(Both[""]: "/***" => BlockComment);

        // Test unexpected terminator.
        t!(Both: "/*Hi*/*/" => BlockComment,
            Error(Full, "unexpected end of block comment"));
    }
}
