summaryrefslogtreecommitdiff
path: root/src/syntax/parser.rs
diff options
context:
space:
mode:
Diffstat (limited to 'src/syntax/parser.rs')
-rw-r--r--src/syntax/parser.rs558
1 files changed, 558 insertions, 0 deletions
diff --git a/src/syntax/parser.rs b/src/syntax/parser.rs
new file mode 100644
index 00000000..83b333f4
--- /dev/null
+++ b/src/syntax/parser.rs
@@ -0,0 +1,558 @@
+use std::fmt::{self, Display, Formatter};
+use std::mem;
+use std::ops::Range;
+
+use super::{ErrorPos, InnerNode, NodeData, NodeKind, SyntaxNode, TokenMode, Tokens};
+use crate::util::EcoString;
+
+/// A convenient token-based parser.
+pub struct Parser<'s> {
+ /// An iterator over the source tokens.
+ tokens: Tokens<'s>,
+ /// Whether we are at the end of the file or of a group.
+ eof: bool,
+ /// The current token.
+ current: Option<NodeKind>,
+ /// The end byte index of the last non-trivia token.
+ prev_end: usize,
+ /// The start byte index of the peeked token.
+ current_start: usize,
+ /// The stack of open groups.
+ groups: Vec<GroupEntry>,
+ /// The children of the currently built node.
+ children: Vec<SyntaxNode>,
+ /// Whether the last group was not correctly terminated.
+ unterminated_group: bool,
+ /// Whether a group terminator was found that did not close a group.
+ stray_terminator: bool,
+}
+
+impl<'s> Parser<'s> {
+ /// Create a new parser for the source string.
+ pub fn new(text: &'s str, mode: TokenMode) -> Self {
+ Self::with_prefix("", text, mode)
+ }
+
+ /// Create a new parser for the source string that is prefixed by some text
+ /// that does not need to be parsed but taken into account for column
+ /// calculation.
+ pub fn with_prefix(prefix: &str, text: &'s str, mode: TokenMode) -> Self {
+ let mut tokens = Tokens::with_prefix(prefix, text, mode);
+ let current = tokens.next();
+ Self {
+ tokens,
+ eof: current.is_none(),
+ current,
+ prev_end: 0,
+ current_start: 0,
+ groups: vec![],
+ children: vec![],
+ unterminated_group: false,
+ stray_terminator: false,
+ }
+ }
+
+ /// End the parsing process and return the parsed children.
+ pub fn finish(self) -> Vec<SyntaxNode> {
+ self.children
+ }
+
+ /// End the parsing process and return
+ /// - the parsed children and whether the last token was terminated, if all
+ /// groups were terminated correctly, or
+ /// - `None` otherwise.
+ pub fn consume(self) -> Option<(Vec<SyntaxNode>, bool)> {
+ self.terminated().then(|| (self.children, self.tokens.terminated()))
+ }
+
+ /// Create a new marker.
+ pub fn marker(&mut self) -> Marker {
+ Marker(self.children.len())
+ }
+
+ /// Create a marker right before the trailing trivia.
+ pub fn trivia_start(&self) -> Marker {
+ let count = self
+ .children
+ .iter()
+ .rev()
+ .take_while(|node| self.is_trivia(node.kind()))
+ .count();
+ Marker(self.children.len() - count)
+ }
+
+ /// Perform a subparse that wraps its result in a node with the given kind.
+ pub fn perform<F, T>(&mut self, kind: NodeKind, f: F) -> T
+ where
+ F: FnOnce(&mut Self) -> T,
+ {
+ let prev = mem::take(&mut self.children);
+ let output = f(self);
+ let until = self.trivia_start();
+ let mut children = mem::replace(&mut self.children, prev);
+
+ if self.tokens.mode() == TokenMode::Markup {
+ self.children.push(InnerNode::with_children(kind, children).into());
+ } else {
+ // Trailing trivia should not be wrapped into the new node.
+ let idx = self.children.len();
+ self.children.push(SyntaxNode::default());
+ self.children.extend(children.drain(until.0 ..));
+ self.children[idx] = InnerNode::with_children(kind, children).into();
+ }
+
+ output
+ }
+
+ /// Whether the end of the source string or group is reached.
+ pub fn eof(&self) -> bool {
+ self.eof
+ }
+
+ /// Consume the current token and also trailing trivia.
+ pub fn eat(&mut self) {
+ self.stray_terminator |= match self.current {
+ Some(NodeKind::RightParen) => !self.inside(Group::Paren),
+ Some(NodeKind::RightBracket) => !self.inside(Group::Bracket),
+ Some(NodeKind::RightBrace) => !self.inside(Group::Brace),
+ _ => false,
+ };
+
+ self.prev_end = self.tokens.cursor();
+ self.bump();
+
+ if self.tokens.mode() != TokenMode::Markup {
+ // Skip whitespace and comments.
+ while self.current.as_ref().map_or(false, |x| self.is_trivia(x)) {
+ self.bump();
+ }
+ }
+
+ self.repeek();
+ }
+
+ /// Consume the current token if it is the given one.
+ pub fn eat_if(&mut self, kind: NodeKind) -> bool {
+ let at = self.at(kind);
+ if at {
+ self.eat();
+ }
+ at
+ }
+
+ /// Eat tokens while the condition is true.
+ pub fn eat_while<F>(&mut self, mut f: F)
+ where
+ F: FnMut(&NodeKind) -> bool,
+ {
+ while self.peek().map_or(false, |t| f(t)) {
+ self.eat();
+ }
+ }
+
+ /// Consume the current token if it is the given one and produce an error if
+ /// not.
+ pub fn expect(&mut self, kind: NodeKind) -> ParseResult {
+ let at = self.peek() == Some(&kind);
+ if at {
+ self.eat();
+ Ok(())
+ } else {
+ self.expected(kind.name());
+ Err(ParseError)
+ }
+ }
+
+ /// Consume the current token, debug-asserting that it is the given one.
+ #[track_caller]
+ pub fn assert(&mut self, kind: NodeKind) {
+ debug_assert_eq!(self.peek(), Some(&kind));
+ self.eat();
+ }
+
+ /// Whether the current token is of the given type.
+ pub fn at(&self, kind: NodeKind) -> bool {
+ self.peek() == Some(&kind)
+ }
+
+ /// Peek at the current token without consuming it.
+ pub fn peek(&self) -> Option<&NodeKind> {
+ if self.eof { None } else { self.current.as_ref() }
+ }
+
+ /// Peek at the current token, but only if it follows immediately after the
+ /// last one without any trivia in between.
+ pub fn peek_direct(&self) -> Option<&NodeKind> {
+ if self.prev_end() == self.current_start() {
+ self.peek()
+ } else {
+ None
+ }
+ }
+
+ /// Peek at the source of the current token.
+ pub fn peek_src(&self) -> &'s str {
+ self.get(self.current_start() .. self.current_end())
+ }
+
+ /// Obtain a range of the source code.
+ pub fn get(&self, range: Range<usize>) -> &'s str {
+ self.tokens.scanner().get(range)
+ }
+
+ /// The byte index at which the last non-trivia token ended.
+ pub fn prev_end(&self) -> usize {
+ self.prev_end
+ }
+
+ /// The byte index at which the current token starts.
+ pub fn current_start(&self) -> usize {
+ self.current_start
+ }
+
+ /// The byte index at which the current token ends.
+ pub fn current_end(&self) -> usize {
+ self.tokens.cursor()
+ }
+
+ /// Determine the column index for the given byte index.
+ pub fn column(&self, index: usize) -> usize {
+ self.tokens.column(index)
+ }
+
+ /// Continue parsing in a group.
+ ///
+ /// When the end delimiter of the group is reached, all subsequent calls to
+ /// `peek()` return `None`. Parsing can only continue with a matching call
+ /// to `end_group`.
+ ///
+ /// This panics if the current token does not start the given group.
+ #[track_caller]
+ pub fn start_group(&mut self, kind: Group) {
+ self.groups.push(GroupEntry { kind, prev_mode: self.tokens.mode() });
+ self.tokens.set_mode(match kind {
+ Group::Strong | Group::Emph => TokenMode::Markup,
+ Group::Bracket => match self.tokens.mode() {
+ TokenMode::Math => TokenMode::Math,
+ _ => TokenMode::Markup,
+ },
+ Group::Brace | Group::Paren => match self.tokens.mode() {
+ TokenMode::Math => TokenMode::Math,
+ _ => TokenMode::Code,
+ },
+ Group::Math => TokenMode::Math,
+ Group::Expr | Group::Imports => TokenMode::Code,
+ });
+
+ match kind {
+ Group::Brace => self.assert(NodeKind::LeftBrace),
+ Group::Bracket => self.assert(NodeKind::LeftBracket),
+ Group::Paren => self.assert(NodeKind::LeftParen),
+ Group::Strong => self.assert(NodeKind::Star),
+ Group::Emph => self.assert(NodeKind::Underscore),
+ Group::Math => self.assert(NodeKind::Dollar),
+ Group::Expr => self.repeek(),
+ Group::Imports => self.repeek(),
+ }
+ }
+
+ /// End the parsing of a group.
+ ///
+ /// This panics if no group was started.
+ #[track_caller]
+ pub fn end_group(&mut self) {
+ let group_mode = self.tokens.mode();
+ let group = self.groups.pop().expect("no started group");
+ self.tokens.set_mode(group.prev_mode);
+
+ let mut rescan = self.tokens.mode() != group_mode;
+
+ // Eat the end delimiter if there is one.
+ if let Some((end, required)) = match group.kind {
+ Group::Brace => Some((NodeKind::RightBrace, true)),
+ Group::Bracket => Some((NodeKind::RightBracket, true)),
+ Group::Paren => Some((NodeKind::RightParen, true)),
+ Group::Strong => Some((NodeKind::Star, true)),
+ Group::Emph => Some((NodeKind::Underscore, true)),
+ Group::Math => Some((NodeKind::Dollar, true)),
+ Group::Expr => Some((NodeKind::Semicolon, false)),
+ Group::Imports => None,
+ } {
+ if self.current.as_ref() == Some(&end) {
+ // If another group closes after a group with the missing
+ // terminator, its scope of influence ends here and no longer
+ // taints the rest of the reparse.
+ self.unterminated_group = false;
+
+ // Bump the delimeter and return. No need to rescan in this
+ // case. Also, we know that the delimiter is not stray even
+ // though we already removed the group.
+ let s = self.stray_terminator;
+ self.eat();
+ self.stray_terminator = s;
+ rescan = false;
+ } else if required {
+ self.expected(end.name());
+ self.unterminated_group = true;
+ }
+ }
+
+ // Rescan the peeked token if the mode changed.
+ if rescan {
+ let mut target = self.prev_end();
+ if group_mode != TokenMode::Markup {
+ let start = self.trivia_start().0;
+ target = self.current_start
+ - self.children[start ..].iter().map(SyntaxNode::len).sum::<usize>();
+ self.children.truncate(start);
+ }
+
+ self.tokens.jump(target);
+ self.prev_end = self.tokens.cursor();
+ self.current_start = self.tokens.cursor();
+ self.current = self.tokens.next();
+ }
+
+ self.repeek();
+ }
+
+ /// Checks if all groups were correctly terminated.
+ fn terminated(&self) -> bool {
+ self.groups.is_empty() && !self.unterminated_group && !self.stray_terminator
+ }
+
+ /// Low-level bump that consumes exactly one token without special trivia
+ /// handling.
+ fn bump(&mut self) {
+ let kind = self.current.take().unwrap();
+ let len = self.tokens.cursor() - self.current_start;
+ self.children.push(NodeData::new(kind, len).into());
+ self.current_start = self.tokens.cursor();
+ self.current = self.tokens.next();
+ }
+
+ /// Take another look at the current token to recheck whether it ends a
+ /// group.
+ fn repeek(&mut self) {
+ self.eof = match &self.current {
+ Some(NodeKind::RightBrace) => self.inside(Group::Brace),
+ Some(NodeKind::RightBracket) => self.inside(Group::Bracket),
+ Some(NodeKind::RightParen) => self.inside(Group::Paren),
+ Some(NodeKind::Star) => self.inside(Group::Strong),
+ Some(NodeKind::Underscore) => self.inside(Group::Emph),
+ Some(NodeKind::Dollar) => self.inside(Group::Math),
+ Some(NodeKind::Semicolon) => self.inside(Group::Expr),
+ Some(NodeKind::From) => self.inside(Group::Imports),
+ Some(NodeKind::Space { newlines }) => self.space_ends_group(*newlines),
+ Some(_) => false,
+ None => true,
+ };
+ }
+
+ /// Returns whether the given type can be skipped over.
+ fn is_trivia(&self, token: &NodeKind) -> bool {
+ match token {
+ NodeKind::Space { newlines } => !self.space_ends_group(*newlines),
+ NodeKind::LineComment => true,
+ NodeKind::BlockComment => true,
+ _ => false,
+ }
+ }
+
+ /// Whether a space with the given number of newlines ends the current group.
+ fn space_ends_group(&self, n: usize) -> bool {
+ if n == 0 {
+ return false;
+ }
+
+ match self.groups.last().map(|group| group.kind) {
+ Some(Group::Strong | Group::Emph) => n >= 2,
+ Some(Group::Imports) => n >= 1,
+ Some(Group::Expr) if n >= 1 => {
+ // Allow else and method call to continue on next line.
+ self.groups.iter().nth_back(1).map(|group| group.kind)
+ != Some(Group::Brace)
+ || !matches!(
+ self.tokens.clone().next(),
+ Some(NodeKind::Else | NodeKind::Dot)
+ )
+ }
+ _ => false,
+ }
+ }
+
+ /// Whether we are inside the given group (can be nested).
+ fn inside(&self, kind: Group) -> bool {
+ self.groups
+ .iter()
+ .rev()
+ .take_while(|g| !kind.is_weak() || g.kind.is_weak())
+ .any(|g| g.kind == kind)
+ }
+}
+
+/// Error handling.
+impl Parser<'_> {
+ /// Eat the current token and add an error that it is unexpected.
+ pub fn unexpected(&mut self) {
+ if let Some(found) = self.peek() {
+ let msg = format_eco!("unexpected {}", found.name());
+ let error = NodeKind::Error(ErrorPos::Full, msg);
+ self.perform(error, Self::eat);
+ }
+ }
+
+ /// Add an error that the `thing` was expected at the end of the last
+ /// non-trivia token.
+ pub fn expected(&mut self, thing: &str) {
+ self.expected_at(self.trivia_start(), thing);
+ }
+
+ /// Insert an error message that `what` was expected at the marker position.
+ pub fn expected_at(&mut self, marker: Marker, what: &str) {
+ let msg = format_eco!("expected {}", what);
+ let error = NodeKind::Error(ErrorPos::Full, msg);
+ self.children.insert(marker.0, NodeData::new(error, 0).into());
+ }
+
+ /// Eat the current token and add an error that it is not the expected
+ /// `thing`.
+ pub fn expected_found(&mut self, thing: &str) {
+ match self.peek() {
+ Some(found) => {
+ let msg = format_eco!("expected {}, found {}", thing, found.name());
+ let error = NodeKind::Error(ErrorPos::Full, msg);
+ self.perform(error, Self::eat);
+ }
+ None => self.expected(thing),
+ }
+ }
+}
+
+/// Marks a location in a parser's child list.
+#[derive(Debug, Copy, Clone, Eq, PartialEq)]
+pub struct Marker(usize);
+
+impl Marker {
+ /// Peek at the child directly before the marker.
+ pub fn before<'a>(self, p: &'a Parser) -> Option<&'a SyntaxNode> {
+ p.children.get(self.0.checked_sub(1)?)
+ }
+
+ /// Peek at the child directly after the marker.
+ pub fn after<'a>(self, p: &'a Parser) -> Option<&'a SyntaxNode> {
+ p.children.get(self.0)
+ }
+
+ /// Convert the child directly after marker.
+ pub fn convert(self, p: &mut Parser, kind: NodeKind) {
+ if let Some(child) = p.children.get_mut(self.0) {
+ child.convert(kind);
+ }
+ }
+
+ /// Perform a subparse that wraps all children after the marker in a node
+ /// with the given kind.
+ pub fn perform<T, F>(self, p: &mut Parser, kind: NodeKind, f: F) -> T
+ where
+ F: FnOnce(&mut Parser) -> T,
+ {
+ let success = f(p);
+ self.end(p, kind);
+ success
+ }
+
+ /// Wrap all children after the marker (excluding trailing trivia) in a node
+ /// with the given `kind`.
+ pub fn end(self, p: &mut Parser, kind: NodeKind) {
+ let until = p.trivia_start().0.max(self.0);
+ let children = p.children.drain(self.0 .. until).collect();
+ p.children
+ .insert(self.0, InnerNode::with_children(kind, children).into());
+ }
+
+ /// Wrap all children that do not fulfill the predicate in error nodes.
+ pub fn filter_children<F>(self, p: &mut Parser, mut f: F)
+ where
+ F: FnMut(&SyntaxNode) -> Result<(), &'static str>,
+ {
+ for child in &mut p.children[self.0 ..] {
+ // Don't expose errors.
+ if child.kind().is_error() {
+ continue;
+ }
+
+ // Don't expose trivia in code.
+ if p.tokens.mode() != TokenMode::Markup && child.kind().is_trivia() {
+ continue;
+ }
+
+ if let Err(msg) = f(child) {
+ let mut msg = EcoString::from(msg);
+ if msg.starts_with("expected") {
+ msg.push_str(", found ");
+ msg.push_str(child.kind().name());
+ }
+ let error = NodeKind::Error(ErrorPos::Full, msg);
+ let inner = mem::take(child);
+ *child = InnerNode::with_child(error, inner).into();
+ }
+ }
+ }
+}
+
+/// A logical group of tokens, e.g. `[...]`.
+#[derive(Debug)]
+struct GroupEntry {
+ /// The kind of group this is. This decides which token(s) will end the
+ /// group. For example, a [`Group::Paren`] will be ended by
+ /// [`Token::RightParen`].
+ pub kind: Group,
+ /// The mode the parser was in _before_ the group started (to which we go
+ /// back once the group ends).
+ pub prev_mode: TokenMode,
+}
+
+/// A group, confined by optional start and end delimiters.
+#[derive(Debug, Copy, Clone, Eq, PartialEq)]
+pub enum Group {
+ /// A curly-braced group: `{...}`.
+ Brace,
+ /// A bracketed group: `[...]`.
+ Bracket,
+ /// A parenthesized group: `(...)`.
+ Paren,
+ /// A group surrounded with stars: `*...*`.
+ Strong,
+ /// A group surrounded with underscore: `_..._`.
+ Emph,
+ /// A group surrounded by dollar signs: `$...$`.
+ Math,
+ /// A group ended by a semicolon or a line break: `;`, `\n`.
+ Expr,
+ /// A group for import items, ended by a semicolon, line break or `from`.
+ Imports,
+}
+
+impl Group {
+ /// Whether the group can only force other weak groups to end.
+ fn is_weak(self) -> bool {
+ matches!(self, Group::Strong | Group::Emph)
+ }
+}
+
+/// Allows parser methods to use the try operator. Never returned top-level
+/// because the parser recovers from all errors.
+pub type ParseResult<T = ()> = Result<T, ParseError>;
+
+/// The error type for parsing.
+#[derive(Debug, Copy, Clone, Eq, PartialEq)]
+pub struct ParseError;
+
+impl Display for ParseError {
+ fn fmt(&self, f: &mut Formatter) -> fmt::Result {
+ f.pad("failed to parse")
+ }
+}
+
+impl std::error::Error for ParseError {}