summaryrefslogtreecommitdiff
path: root/src/parse/parser.rs
diff options
context:
space:
mode:
authorLaurenz <laurmaedje@gmail.com>2021-11-08 13:08:15 +0100
committerGitHub <noreply@github.com>2021-11-08 13:08:15 +0100
commitc6f8ad35f45248f1fd36ee00195966f1629c6ca7 (patch)
tree51faa3f6bbc56f75636823adeea135ed76e1b33b /src/parse/parser.rs
parentea6ee3f667e922ed2f21b08719a45d2395787932 (diff)
parent38c5c362419c5eee7a4fdc0b43d3a9dfb339a6d2 (diff)
Merge pull request #46 from typst/parser-ng
Next Generation Parser
Diffstat (limited to 'src/parse/parser.rs')
-rw-r--r--src/parse/parser.rs556
1 files changed, 310 insertions, 246 deletions
diff --git a/src/parse/parser.rs b/src/parse/parser.rs
index 347d6f71..1c4c2a5c 100644
--- a/src/parse/parser.rs
+++ b/src/parse/parser.rs
@@ -1,250 +1,216 @@
-use std::ops::Range;
+use std::mem;
use super::{TokenMode, Tokens};
-use crate::diag::Error;
-use crate::source::{SourceFile, SourceId};
-use crate::syntax::{IntoSpan, Pos, Span, Token};
+use crate::syntax::{ErrorPos, Green, GreenData, GreenNode, NodeKind};
+use crate::util::EcoString;
+
+/// Allows parser methods to use the try operator. Not exposed as the parser
+/// recovers from all errors.
+pub(crate) type ParseResult<T = ()> = Result<T, ()>;
/// A convenient token-based parser.
pub struct Parser<'s> {
- /// The parsed file.
- source: &'s SourceFile,
- /// Parsing errors.
- errors: Vec<Error>,
/// An iterator over the source tokens.
tokens: Tokens<'s>,
+ /// Whether we are at the end of the file or of a group.
+ eof: bool,
+ /// The current token.
+ current: Option<NodeKind>,
+ /// The end byte index of the last non-trivia token.
+ prev_end: usize,
+ /// The start byte index of the peeked token.
+ current_start: usize,
/// The stack of open groups.
groups: Vec<GroupEntry>,
- /// The next token.
- next: Option<Token<'s>>,
- /// The peeked token.
- /// (Same as `next` except if we are at the end of group, then `None`).
- peeked: Option<Token<'s>>,
- /// The end index of the last (non-whitespace if in code mode) token.
- prev_end: usize,
- /// The start index of the peeked token.
- next_start: usize,
-}
-
-/// A logical group of tokens, e.g. `[...]`.
-struct GroupEntry {
- /// The kind of group this is. This decides which tokens will end the group.
- /// For example, a [`Group::Paren`] will be ended by
- /// [`Token::RightParen`].
- pub kind: Group,
- /// The start index of the group. Used by `Parser::end_group` to return the
- /// group's full span.
- pub start: usize,
- /// The mode the parser was in _before_ the group started (to which we go
- /// back once the group ends).
- pub prev_mode: TokenMode,
-}
-
-/// A group, confined by optional start and end delimiters.
-#[derive(Debug, Copy, Clone, Eq, PartialEq)]
-pub enum Group {
- /// A parenthesized group: `(...)`.
- Paren,
- /// A bracketed group: `[...]`.
- Bracket,
- /// A curly-braced group: `{...}`.
- Brace,
- /// A group ended by a semicolon or a line break: `;`, `\n`.
- Stmt,
- /// A group for a single expression, ended by a line break.
- Expr,
- /// A group for import items, ended by a semicolon, line break or `from`.
- Imports,
+ /// The children of the currently built node.
+ children: Vec<Green>,
}
impl<'s> Parser<'s> {
/// Create a new parser for the source string.
- pub fn new(source: &'s SourceFile) -> Self {
- let mut tokens = Tokens::new(source.src(), TokenMode::Markup);
- let next = tokens.next();
+ pub fn new(src: &'s str) -> Self {
+ let mut tokens = Tokens::new(src, TokenMode::Markup);
+ let current = tokens.next();
Self {
- source,
- errors: vec![],
tokens,
- groups: vec![],
- next,
- peeked: next,
+ eof: current.is_none(),
+ current,
prev_end: 0,
- next_start: 0,
+ current_start: 0,
+ groups: vec![],
+ children: vec![],
}
}
- /// Finish parsing and return all errors.
- pub fn finish(self) -> Vec<Error> {
- self.errors
+ /// End the parsing process and return the last child.
+ pub fn finish(self) -> Vec<Green> {
+ self.children
}
- /// The id of the parsed source file.
- pub fn id(&self) -> SourceId {
- self.source.id()
+ /// Create a new marker.
+ pub fn marker(&mut self) -> Marker {
+ Marker(self.children.len())
}
- /// Whether the end of the source string or group is reached.
- pub fn eof(&self) -> bool {
- self.peek().is_none()
+ /// Create a markup right before the trailing trivia.
+ pub fn trivia_start(&self) -> Marker {
+ let count = self
+ .children
+ .iter()
+ .rev()
+ .take_while(|node| self.is_trivia(node.kind()))
+ .count();
+ Marker(self.children.len() - count)
}
- /// Consume the next token.
- pub fn eat(&mut self) -> Option<Token<'s>> {
- let token = self.peek()?;
- self.bump();
- Some(token)
+ /// Perform a subparse that wraps its result in a node with the given kind.
+ pub fn perform<F, T>(&mut self, kind: NodeKind, f: F) -> T
+ where
+ F: FnOnce(&mut Self) -> T,
+ {
+ let prev = mem::take(&mut self.children);
+ let output = f(self);
+ let until = self.trivia_start();
+ let mut children = mem::replace(&mut self.children, prev);
+
+ if self.tokens.mode() == TokenMode::Code {
+ // Trailing trivia should not be wrapped into the new node.
+ let idx = self.children.len();
+ self.children.push(Green::default());
+ self.children.extend(children.drain(until.0 ..));
+ self.children[idx] = GreenNode::with_children(kind, children).into();
+ } else {
+ self.children.push(GreenNode::with_children(kind, children).into());
+ }
+
+ output
}
- /// Eat the next token and return its source range.
- pub fn eat_span(&mut self) -> Span {
- let start = self.next_start();
- self.eat();
- Span::new(self.id(), start, self.prev_end())
+ /// Whether the end of the source string or group is reached.
+ pub fn eof(&self) -> bool {
+ self.eof
}
- /// Consume the next token if it is the given one.
- pub fn eat_if(&mut self, t: Token) -> bool {
- if self.peek() == Some(t) {
- self.bump();
- true
- } else {
- false
+ /// Consume the current token and also trailing trivia.
+ pub fn eat(&mut self) {
+ self.prev_end = self.tokens.index();
+ self.bump();
+
+ if self.tokens.mode() == TokenMode::Code {
+ // Skip whitespace and comments.
+ while self.current.as_ref().map_or(false, |x| self.is_trivia(x)) {
+ self.bump();
+ }
}
+
+ self.repeek();
}
- /// Consume the next token if the closure maps it a to `Some`-variant.
- pub fn eat_map<T, F>(&mut self, f: F) -> Option<T>
- where
- F: FnOnce(Token<'s>) -> Option<T>,
- {
- let token = self.peek()?;
- let mapped = f(token);
- if mapped.is_some() {
- self.bump();
+ /// Eat if the current token it is the given one.
+ pub fn eat_if(&mut self, t: &NodeKind) -> bool {
+ let at = self.at(t);
+ if at {
+ self.eat();
}
- mapped
+ at
}
- /// Consume the next token if it is the given one and produce an error if
- /// not.
- pub fn eat_expect(&mut self, t: Token) -> bool {
+ /// Eat if the current token is the given one and produce an error if not.
+ pub fn eat_expect(&mut self, t: &NodeKind) -> ParseResult {
let eaten = self.eat_if(t);
if !eaten {
- self.expected_at(self.prev_end(), t.name());
+ self.expected_at(t.as_str());
}
- eaten
+ if eaten { Ok(()) } else { Err(()) }
}
- /// Consume the next token, debug-asserting that it is one of the given ones.
- pub fn eat_assert(&mut self, t: Token) {
- let next = self.eat();
- debug_assert_eq!(next, Some(t));
+ /// Eat, debug-asserting that the token is the given one.
+ pub fn eat_assert(&mut self, t: &NodeKind) {
+ debug_assert_eq!(self.peek(), Some(t));
+ self.eat();
}
- /// Consume tokens while the condition is true.
+ /// Eat tokens while the condition is true.
pub fn eat_while<F>(&mut self, mut f: F)
where
- F: FnMut(Token<'s>) -> bool,
+ F: FnMut(&NodeKind) -> bool,
{
while self.peek().map_or(false, |t| f(t)) {
self.eat();
}
}
- /// Peek at the next token without consuming it.
- pub fn peek(&self) -> Option<Token<'s>> {
- self.peeked
+ /// Eat the current token, but change its type.
+ pub fn convert(&mut self, kind: NodeKind) {
+ let marker = self.marker();
+ self.eat();
+ marker.convert(self, kind);
}
- /// Peek at the next token if it follows immediately after the last one
- /// without any whitespace in between.
- pub fn peek_direct(&self) -> Option<Token<'s>> {
- if self.next_start() == self.prev_end() {
- self.peeked
- } else {
- None
- }
+ /// Whether the current token is of the given type.
+ pub fn at(&self, kind: &NodeKind) -> bool {
+ self.peek() == Some(kind)
}
- /// Peek at the span of the next token.
- ///
- /// Has length zero if `peek()` returns `None`.
- pub fn peek_span(&self) -> Span {
- Span::new(self.id(), self.next_start(), self.next_end())
+ /// Peek at the current token without consuming it.
+ pub fn peek(&self) -> Option<&NodeKind> {
+ if self.eof { None } else { self.current.as_ref() }
}
- /// Peek at the source of the next token.
- pub fn peek_src(&self) -> &'s str {
- self.get(self.next_start() .. self.next_end())
+ /// Peek at the current token, if it follows immediately after the last one
+ /// without any trivia in between.
+ pub fn peek_direct(&self) -> Option<&NodeKind> {
+ if self.prev_end() == self.current_start() {
+ self.peek()
+ } else {
+ None
+ }
}
- /// Checks whether the next token fulfills a condition.
- ///
- /// Returns `false` if there is no next token.
- pub fn check<F>(&self, f: F) -> bool
- where
- F: FnOnce(Token<'s>) -> bool,
- {
- self.peek().map_or(false, f)
+ /// Peek at the source of the current token.
+ pub fn peek_src(&self) -> &'s str {
+ self.tokens.scanner().get(self.current_start() .. self.current_end())
}
- /// The byte index at which the last token ended.
- ///
- /// Refers to the end of the last _non-whitespace_ token in code mode.
+ /// The byte index at which the last non-trivia token ended.
pub fn prev_end(&self) -> usize {
self.prev_end
}
- /// The byte index at which the next token starts.
- pub fn next_start(&self) -> usize {
- self.next_start
+ /// The byte index at which the current token starts.
+ pub fn current_start(&self) -> usize {
+ self.current_start
}
- /// The byte index at which the next token will end.
- ///
- /// Is the same as [`next_start()`][Self::next_start] if `peek()` returns
- /// `None`.
- pub fn next_end(&self) -> usize {
+ /// The byte index at which the current token ends.
+ pub fn current_end(&self) -> usize {
self.tokens.index()
}
/// Determine the column index for the given byte index.
pub fn column(&self, index: usize) -> usize {
- self.source.byte_to_column(index).unwrap()
- }
-
- /// Slice out part of the source string.
- pub fn get(&self, range: Range<usize>) -> &'s str {
- self.source.get(range).unwrap()
- }
-
- /// The span from `start` to [`self.prev_end()`](Self::prev_end).
- pub fn span_from(&self, start: impl Into<Pos>) -> Span {
- Span::new(self.id(), start, self.prev_end())
+ self.tokens.scanner().column(index)
}
/// Continue parsing in a group.
///
/// When the end delimiter of the group is reached, all subsequent calls to
- /// `eat()` and `peek()` return `None`. Parsing can only continue with
- /// a matching call to `end_group`.
+ /// `peek()` return `None`. Parsing can only continue with a matching call
+ /// to `end_group`.
///
- /// This panics if the next token does not start the given group.
- pub fn start_group(&mut self, kind: Group, mode: TokenMode) {
- self.groups.push(GroupEntry {
- kind,
- start: self.next_start(),
- prev_mode: self.tokens.mode(),
+ /// This panics if the current token does not start the given group.
+ pub fn start_group(&mut self, kind: Group) {
+ self.groups.push(GroupEntry { kind, prev_mode: self.tokens.mode() });
+ self.tokens.set_mode(match kind {
+ Group::Bracket => TokenMode::Markup,
+ _ => TokenMode::Code,
});
- self.tokens.set_mode(mode);
self.repeek();
-
match kind {
- Group::Paren => self.eat_assert(Token::LeftParen),
- Group::Bracket => self.eat_assert(Token::LeftBracket),
- Group::Brace => self.eat_assert(Token::LeftBrace),
+ Group::Paren => self.eat_assert(&NodeKind::LeftParen),
+ Group::Bracket => self.eat_assert(&NodeKind::LeftBracket),
+ Group::Brace => self.eat_assert(&NodeKind::LeftBrace),
Group::Stmt => {}
Group::Expr => {}
Group::Imports => {}
@@ -254,130 +220,228 @@ impl<'s> Parser<'s> {
/// End the parsing of a group.
///
/// This panics if no group was started.
- pub fn end_group(&mut self) -> Span {
- let prev_mode = self.tokens.mode();
+ pub fn end_group(&mut self) {
+ let group_mode = self.tokens.mode();
let group = self.groups.pop().expect("no started group");
self.tokens.set_mode(group.prev_mode);
self.repeek();
- let mut rescan = self.tokens.mode() != prev_mode;
+ let mut rescan = self.tokens.mode() != group_mode;
// Eat the end delimiter if there is one.
if let Some((end, required)) = match group.kind {
- Group::Paren => Some((Token::RightParen, true)),
- Group::Bracket => Some((Token::RightBracket, true)),
- Group::Brace => Some((Token::RightBrace, true)),
- Group::Stmt => Some((Token::Semicolon, false)),
+ Group::Paren => Some((NodeKind::RightParen, true)),
+ Group::Bracket => Some((NodeKind::RightBracket, true)),
+ Group::Brace => Some((NodeKind::RightBrace, true)),
+ Group::Stmt => Some((NodeKind::Semicolon, false)),
Group::Expr => None,
Group::Imports => None,
} {
- if self.next == Some(end) {
+ if self.current.as_ref() == Some(&end) {
// Bump the delimeter and return. No need to rescan in this case.
- self.bump();
+ self.eat();
rescan = false;
} else if required {
- self.error(
- self.next_start() .. self.next_start(),
- format!("expected {}", end.name()),
- );
+ self.push_error(format!("expected {}", end));
}
}
// Rescan the peeked token if the mode changed.
if rescan {
+ if group_mode == TokenMode::Code {
+ self.children.truncate(self.trivia_start().0);
+ }
+
self.tokens.jump(self.prev_end());
- self.bump();
+ self.prev_end = self.tokens.index();
+ self.current_start = self.tokens.index();
+ self.current = self.tokens.next();
+ self.repeek();
}
+ }
- Span::new(self.id(), group.start, self.prev_end())
+ /// Low-level bump that consumes exactly one token without special trivia
+ /// handling.
+ fn bump(&mut self) {
+ let kind = self.current.take().unwrap();
+ let len = self.tokens.index() - self.current_start;
+ self.children.push(GreenData::new(kind, len).into());
+ self.current_start = self.tokens.index();
+ self.current = self.tokens.next();
}
- /// Add an error with location and message.
- pub fn error(&mut self, span: impl IntoSpan, message: impl Into<String>) {
- self.errors.push(Error::new(span.into_span(self.id()), message));
+ /// Take another look at the current token to recheck whether it ends a
+ /// group.
+ fn repeek(&mut self) {
+ self.eof = match &self.current {
+ Some(NodeKind::RightParen) => self.inside(Group::Paren),
+ Some(NodeKind::RightBracket) => self.inside(Group::Bracket),
+ Some(NodeKind::RightBrace) => self.inside(Group::Brace),
+ Some(NodeKind::Semicolon) => self.inside(Group::Stmt),
+ Some(NodeKind::From) => self.inside(Group::Imports),
+ Some(NodeKind::Space(n)) => *n >= 1 && self.stop_at_newline(),
+ Some(_) => false,
+ None => true,
+ };
}
- /// Add an error that `what` was expected at the given span.
- pub fn expected_at(&mut self, span: impl IntoSpan, what: &str) {
- self.error(span, format!("expected {}", what));
+ /// Returns whether the given type can be skipped over.
+ fn is_trivia(&self, token: &NodeKind) -> bool {
+ Self::is_trivia_ext(token, self.stop_at_newline())
}
- /// Eat the next token and add an error that it is not the expected `thing`.
- pub fn expected(&mut self, what: &str) {
- let before = self.next_start();
- if let Some(found) = self.eat() {
- let after = self.prev_end();
- self.error(
- before .. after,
- format!("expected {}, found {}", what, found.name()),
- );
- } else {
- self.expected_at(self.next_start(), what);
+ /// Returns whether the given type can be skipped over given the current
+ /// newline mode.
+ fn is_trivia_ext(token: &NodeKind, stop_at_newline: bool) -> bool {
+ match token {
+ NodeKind::Space(n) => *n == 0 || !stop_at_newline,
+ NodeKind::LineComment => true,
+ NodeKind::BlockComment => true,
+ _ => false,
}
}
- /// Eat the next token and add an error that it is unexpected.
+ /// Whether the active group must end at a newline.
+ fn stop_at_newline(&self) -> bool {
+ matches!(
+ self.groups.last().map(|group| group.kind),
+ Some(Group::Stmt | Group::Expr | Group::Imports)
+ )
+ }
+
+ /// Whether we are inside the given group.
+ fn inside(&self, kind: Group) -> bool {
+ self.groups.iter().any(|g| g.kind == kind)
+ }
+}
+
+/// Error handling.
+impl Parser<'_> {
+ /// Push an error into the children list.
+ pub fn push_error(&mut self, msg: impl Into<EcoString>) {
+ let error = NodeKind::Error(ErrorPos::Full, msg.into());
+ self.children.push(GreenData::new(error, 0).into());
+ }
+
+ /// Eat the current token and add an error that it is unexpected.
pub fn unexpected(&mut self) {
- let before = self.next_start();
- if let Some(found) = self.eat() {
- let after = self.prev_end();
- self.error(before .. after, format!("unexpected {}", found.name()));
+ match self.peek() {
+ Some(found) => {
+ let msg = format!("unexpected {}", found);
+ let error = NodeKind::Error(ErrorPos::Full, msg.into());
+ self.perform(error, Self::eat);
+ }
+ None => self.push_error("unexpected end of file"),
}
}
- /// Move to the next token.
- fn bump(&mut self) {
- self.prev_end = self.tokens.index().into();
- self.next_start = self.tokens.index().into();
- self.next = self.tokens.next();
-
- if self.tokens.mode() == TokenMode::Code {
- // Skip whitespace and comments.
- while match self.next {
- Some(Token::Space(n)) => n < 1 || !self.stop_at_newline(),
- Some(Token::LineComment(_)) => true,
- Some(Token::BlockComment(_)) => true,
- _ => false,
- } {
- self.next_start = self.tokens.index().into();
- self.next = self.tokens.next();
+ /// Eat the current token and add an error that it is not the expected `thing`.
+ pub fn expected(&mut self, thing: &str) {
+ match self.peek() {
+ Some(found) => {
+ let msg = format!("expected {}, found {}", thing, found);
+ let error = NodeKind::Error(ErrorPos::Full, msg.into());
+ self.perform(error, Self::eat);
}
+ None => self.expected_at(thing),
}
+ }
- self.repeek();
+ /// Add an error that the `thing` was expected at the end of the last
+ /// non-trivia token.
+ pub fn expected_at(&mut self, thing: &str) {
+ self.trivia_start().expected(self, thing);
}
+}
- /// Take another look at the next token to recheck whether it ends a group.
- fn repeek(&mut self) {
- self.peeked = self.next;
- let token = match self.next {
- Some(token) => token,
- None => return,
- };
+/// A marker that indicates where a node may start.
+#[derive(Debug, Copy, Clone, Eq, PartialEq)]
+pub struct Marker(usize);
- if match token {
- Token::RightParen => self.inside(Group::Paren),
- Token::RightBracket => self.inside(Group::Bracket),
- Token::RightBrace => self.inside(Group::Brace),
- Token::Semicolon => self.inside(Group::Stmt),
- Token::From => self.inside(Group::Imports),
- Token::Space(n) => n >= 1 && self.stop_at_newline(),
- _ => false,
- } {
- self.peeked = None;
+impl Marker {
+ /// Perform a subparse that wraps all children after the marker in a node
+ /// with the given kind.
+ pub fn perform<T, F>(self, p: &mut Parser, kind: NodeKind, f: F) -> T
+ where
+ F: FnOnce(&mut Parser) -> T,
+ {
+ let success = f(p);
+ self.end(p, kind);
+ success
+ }
+
+ /// Wrap all children after the marker (excluding trailing trivia) in a node
+ /// with the given `kind`.
+ pub fn end(self, p: &mut Parser, kind: NodeKind) {
+ let until = p.trivia_start();
+ let children = p.children.drain(self.0 .. until.0).collect();
+ p.children
+ .insert(self.0, GreenNode::with_children(kind, children).into());
+ }
+
+ /// Wrap all children that do not fulfill the predicate in error nodes.
+ pub fn filter_children<F>(self, p: &mut Parser, f: F)
+ where
+ F: Fn(&Green) -> Result<(), &'static str>,
+ {
+ for child in &mut p.children[self.0 ..] {
+ if (p.tokens.mode() == TokenMode::Markup
+ || !Parser::is_trivia_ext(child.kind(), false))
+ && !child.kind().is_error()
+ {
+ if let Err(msg) = f(child) {
+ let error = NodeKind::Error(ErrorPos::Full, msg.into());
+ let inner = mem::take(child);
+ *child = GreenNode::with_child(error, inner).into();
+ }
+ }
}
}
- /// Whether the active group ends at a newline.
- fn stop_at_newline(&self) -> bool {
- matches!(
- self.groups.last().map(|group| group.kind),
- Some(Group::Stmt | Group::Expr | Group::Imports)
- )
+ /// Insert an error message that `what` was expected at the marker position.
+ pub fn expected(self, p: &mut Parser, what: &str) {
+ let msg = format!("expected {}", what);
+ let error = NodeKind::Error(ErrorPos::Full, msg.into());
+ p.children.insert(self.0, GreenData::new(error, 0).into());
}
- /// Whether we are inside the given group.
- fn inside(&self, kind: Group) -> bool {
- self.groups.iter().any(|g| g.kind == kind)
+ /// Peek at the child directly after the marker.
+ pub fn peek<'a>(self, p: &'a Parser) -> Option<&'a Green> {
+ p.children.get(self.0)
}
+
+ /// Convert the child directly after marker.
+ pub fn convert(self, p: &mut Parser, kind: NodeKind) {
+ if let Some(child) = p.children.get_mut(self.0) {
+ child.convert(kind);
+ }
+ }
+}
+
+/// A logical group of tokens, e.g. `[...]`.
+struct GroupEntry {
+ /// The kind of group this is. This decides which tokens will end the group.
+ /// For example, a [`Group::Paren`] will be ended by
+ /// [`Token::RightParen`].
+ pub kind: Group,
+ /// The mode the parser was in _before_ the group started (to which we go
+ /// back once the group ends).
+ pub prev_mode: TokenMode,
+}
+
+/// A group, confined by optional start and end delimiters.
+#[derive(Debug, Copy, Clone, Eq, PartialEq)]
+pub enum Group {
+ /// A bracketed group: `[...]`.
+ Bracket,
+ /// A curly-braced group: `{...}`.
+ Brace,
+ /// A parenthesized group: `(...)`.
+ Paren,
+ /// A group ended by a semicolon or a line break: `;`, `\n`.
+ Stmt,
+ /// A group for a single expression, ended by a line break.
+ Expr,
+ /// A group for import items, ended by a semicolon, line break or `from`.
+ Imports,
}