summaryrefslogtreecommitdiff
path: root/src/parsing/mod.rs
diff options
context:
space:
mode:
authorLaurenz <laurmaedje@gmail.com>2019-10-24 19:10:03 +0200
committerLaurenz <laurmaedje@gmail.com>2019-10-24 19:10:03 +0200
commitb4be25e43b1ee9da924d13b7f2e8289f12bd2c3b (patch)
treea21fbec7e6907993463d57b8c75758d2014cebeb /src/parsing/mod.rs
parentecf0ff4d054f11c79ec0ddbbdf45f3dfcf9fc8d7 (diff)
Prettify peeking and rearrange syntax/parsing modules 🧢
Diffstat (limited to 'src/parsing/mod.rs')
-rw-r--r--src/parsing/mod.rs726
1 files changed, 0 insertions, 726 deletions
diff --git a/src/parsing/mod.rs b/src/parsing/mod.rs
deleted file mode 100644
index 36920595..00000000
--- a/src/parsing/mod.rs
+++ /dev/null
@@ -1,726 +0,0 @@
-//! Parsing of source code into token streams and syntax trees.
-
-use std::collections::HashMap;
-
-use unicode_xid::UnicodeXID;
-
-use crate::func::{Function, Scope};
-use crate::size::Size;
-use crate::syntax::*;
-
-mod tokens;
-
-pub use tokens::{tokenize, Tokens};
-
-/// Parses source code into a syntax tree given a context.
-#[inline]
-pub fn parse(src: &str, ctx: ParseContext) -> ParseResult<SyntaxTree> {
- Parser::new(src, ctx).parse()
-}
-
-/// The context for parsing.
-#[derive(Debug, Copy, Clone)]
-pub struct ParseContext<'a> {
- /// The scope containing function definitions.
- pub scope: &'a Scope,
-}
-
-/// Transforms token streams to syntax trees.
-#[derive(Debug)]
-struct Parser<'s> {
- src: &'s str,
- tokens: PeekableTokens<'s>,
- state: ParserState,
- ctx: ParseContext<'s>,
- tree: SyntaxTree,
-}
-
-/// The state the parser is in.
-#[derive(Debug, Copy, Clone, Eq, PartialEq)]
-enum ParserState {
- /// The base state of the parser.
- Body,
- /// We saw one newline already.
- FirstNewline,
- /// We wrote a newline.
- WroteNewline,
-}
-
-impl<'s> Parser<'s> {
- /// Create a new parser from the source and the context.
- fn new(src: &'s str, ctx: ParseContext<'s>) -> Parser<'s> {
- Parser {
- src,
- tokens: PeekableTokens::new(tokenize(src)),
- state: ParserState::Body,
- ctx,
- tree: SyntaxTree::new(),
- }
- }
-
- /// Parse the source into an abstract syntax tree.
- fn parse(mut self) -> ParseResult<SyntaxTree> {
- // Loop through all the tokens.
- while self.tokens.peek().is_some() {
- self.parse_white()?;
- self.parse_body_part()?;
- }
-
- Ok(self.tree)
- }
-
- /// Parse the next part of the body.
- fn parse_body_part(&mut self) -> ParseResult<()> {
- if let Some(token) = self.tokens.peek() {
- match token {
- // Functions
- Token::LeftBracket => self.parse_func()?,
- Token::RightBracket => return Err(ParseError::new("unexpected closing bracket")),
-
- // Modifiers
- Token::Underscore => self.append_consumed(Node::ToggleItalics),
- Token::Star => self.append_consumed(Node::ToggleBold),
- Token::Backtick => self.append_consumed(Node::ToggleMonospace),
-
- // Normal text
- Token::Text(word) => self.append_consumed(Node::Text(word.to_owned())),
-
- Token::Colon | Token::Equals => panic!("bad token for body: {:?}", token),
-
- // The rest is handled elsewhere or should not happen, because `Tokens` does not
- // yield colons or equals in the body, but their text equivalents instead.
- _ => panic!("unexpected token: {:?}", token),
- }
- }
- Ok(())
- }
-
- /// Parse a complete function from the current position.
- fn parse_func(&mut self) -> ParseResult<()> {
- // This should only be called if a left bracket was seen.
- assert!(self.tokens.next() == Some(Token::LeftBracket));
-
- let header = self.parse_func_header()?;
- let body = self.parse_func_body(&header)?;
-
- // Finally this function is parsed to the end.
- self.append(Node::Func(FuncCall { header, body }));
-
- Ok(self.switch(ParserState::Body))
- }
-
- /// Parse a function header.
- fn parse_func_header(&mut self) -> ParseResult<FuncHeader> {
- // The next token should be the name of the function.
- self.skip_white();
- let name = match self.tokens.next() {
- Some(Token::Text(word)) => {
- if is_identifier(word) {
- Ok(word.to_owned())
- } else {
- Err(ParseError::new(format!("invalid identifier: '{}'", word)))
- }
- }
- _ => Err(ParseError::new("expected identifier")),
- }?;
-
- let mut header = FuncHeader {
- name,
- args: vec![],
- kwargs: HashMap::new(),
- };
-
- self.skip_white();
-
- // Check for arguments
- match self.tokens.next() {
- Some(Token::RightBracket) => {}
- Some(Token::Colon) => {
- let (args, kwargs) = self.parse_func_args()?;
- header.args = args;
- header.kwargs = kwargs;
- }
- _ => {
- return Err(ParseError::new(
- "expected function arguments or closing bracket",
- ))
- }
- }
-
- // Store the header information of the function invocation.
- Ok(header)
- }
-
- /// Parse the arguments to a function.
- fn parse_func_args(&mut self) -> ParseResult<(Vec<Expression>, HashMap<String, Expression>)> {
- let mut args = Vec::new();
- let kwargs = HashMap::new();
-
- let mut comma = false;
- loop {
- self.skip_white();
-
- match self.tokens.peek() {
- Some(Token::Text(_)) | Some(Token::Quoted(_)) if !comma => {
- args.push(self.parse_expression()?);
- comma = true;
- }
-
- Some(Token::Comma) if comma => {
- self.advance();
- comma = false
- }
- Some(Token::RightBracket) => {
- self.advance();
- break;
- }
-
- _ if comma => return Err(ParseError::new("expected comma or closing bracket")),
- _ => return Err(ParseError::new("expected closing bracket")),
- }
- }
-
- Ok((args, kwargs))
- }
-
- /// Parse an expression.
- fn parse_expression(&mut self) -> ParseResult<Expression> {
- Ok(match self.tokens.next() {
- Some(Token::Quoted(text)) => Expression::Str(text.to_owned()),
- Some(Token::Text(text)) => {
- if let Ok(b) = text.parse::<bool>() {
- Expression::Bool(b)
- } else if let Ok(num) = text.parse::<f64>() {
- Expression::Number(num)
- } else if let Ok(size) = text.parse::<Size>() {
- Expression::Size(size)
- } else {
- Expression::Ident(text.to_owned())
- }
- }
- _ => return Err(ParseError::new("expected expression")),
- })
- }
-
- /// Parse the body of a function.
- fn parse_func_body(&mut self, header: &FuncHeader) -> ParseResult<Box<dyn Function>> {
- // Whether the function has a body.
- let has_body = self.tokens.peek() == Some(Token::LeftBracket);
- if has_body {
- self.advance();
- }
-
- // Now we want to parse this function dynamically.
- let parser = self
- .ctx
- .scope
- .get_parser(&header.name)
- .ok_or_else(|| ParseError::new(format!("unknown function: '{}'", &header.name)))?;
-
- // Do the parsing dependent on whether the function has a body.
- Ok(if has_body {
- // Find out the string which makes the body of this function.
- let (start, end) = self
- .tokens
- .current_index()
- .and_then(|index| {
- find_closing_bracket(&self.src[index..]).map(|end| (index, index + end))
- })
- .ok_or_else(|| ParseError::new("expected closing bracket"))?;
-
- // Parse the body.
- let body_string = &self.src[start..end];
- let body = parser(&header, Some(body_string), self.ctx)?;
-
- // Skip to the end of the function in the token stream.
- self.tokens.goto(end);
-
- // Now the body should be closed.
- assert!(self.tokens.next() == Some(Token::RightBracket));
-
- body
- } else {
- parser(&header, None, self.ctx)?
- })
- }
-
- /// Parse whitespace (as long as there is any) and skip over comments.
- fn parse_white(&mut self) -> ParseResult<()> {
- while let Some(token) = self.tokens.peek() {
- match self.state {
- ParserState::FirstNewline => match token {
- Token::Newline => {
- self.append_consumed(Node::Newline);
- self.switch(ParserState::WroteNewline);
- }
- Token::Space => self.append_space_consumed(),
- _ => {
- self.append_space();
- self.switch(ParserState::Body);
- }
- },
- ParserState::WroteNewline => match token {
- Token::Newline | Token::Space => self.append_space_consumed(),
- _ => self.switch(ParserState::Body),
- },
- ParserState::Body => match token {
- // Whitespace
- Token::Space => self.append_space_consumed(),
- Token::Newline => {
- self.advance();
- self.switch(ParserState::FirstNewline);
- }
-
- // Comments
- Token::LineComment(_) | Token::BlockComment(_) => self.advance(),
- Token::StarSlash => {
- return Err(ParseError::new("unexpected end of block comment"));
- }
-
- // Anything else skips out of the function.
- _ => break,
- },
- }
- }
-
- Ok(())
- }
-
- /// Skip over whitespace and comments.
- fn skip_white(&mut self) {
- while let Some(token) = self.tokens.peek() {
- match token {
- Token::Space | Token::Newline | Token::LineComment(_) | Token::BlockComment(_) => {
- self.advance()
- }
- _ => break,
- }
- }
- }
-
- /// Advance the iterator by one step.
- fn advance(&mut self) {
- self.tokens.next();
- }
-
- /// Switch the state.
- fn switch(&mut self, state: ParserState) {
- self.state = state;
- }
-
- /// Append a node to the tree.
- fn append(&mut self, node: Node) {
- self.tree.nodes.push(node);
- }
-
- /// Append a space if there is not one already.
- fn append_space(&mut self) {
- if self.tree.nodes.last() != Some(&Node::Space) {
- self.append(Node::Space);
- }
- }
-
- /// Advance and return the given node.
- fn append_consumed(&mut self, node: Node) {
- self.advance();
- self.append(node);
- }
-
- /// Advance and append a space if there is not one already.
- fn append_space_consumed(&mut self) {
- self.advance();
- self.append_space();
- }
-}
-
-/// Find the index of the first unbalanced and unescaped closing bracket.
-fn find_closing_bracket(src: &str) -> Option<usize> {
- let mut parens = 0;
- let mut escaped = false;
- for (index, c) in src.char_indices() {
- match c {
- '\\' => {
- escaped = !escaped;
- continue;
- }
- ']' if !escaped && parens == 0 => return Some(index),
- '[' if !escaped => parens += 1,
- ']' if !escaped => parens -= 1,
- _ => {}
- }
- escaped = false;
- }
- None
-}
-
-/// A peekable iterator for tokens which allows access to the original iterator
-/// inside this module (which is needed by the parser).
-#[derive(Debug, Clone)]
-struct PeekableTokens<'s> {
- tokens: Tokens<'s>,
- peeked: Option<Option<Token<'s>>>,
-}
-
-impl<'s> PeekableTokens<'s> {
- /// Create a new iterator from a string.
- fn new(tokens: Tokens<'s>) -> PeekableTokens<'s> {
- PeekableTokens {
- tokens,
- peeked: None,
- }
- }
-
- /// Peek at the next element.
- fn peek(&mut self) -> Option<Token<'s>> {
- let iter = &mut self.tokens;
- *self.peeked.get_or_insert_with(|| iter.next())
- }
-
- /// The index of the first character of the next token in the source string.
- fn current_index(&mut self) -> Option<usize> {
- self.tokens.chars.current_index()
- }
-
- /// Go to a new position in the underlying string.
- fn goto(&mut self, index: usize) {
- self.tokens.chars.goto(index);
- self.peeked = None;
- }
-}
-
-impl<'s> Iterator for PeekableTokens<'s> {
- type Item = Token<'s>;
-
- fn next(&mut self) -> Option<Token<'s>> {
- match self.peeked.take() {
- Some(value) => value,
- None => self.tokens.next(),
- }
- }
-}
-
-/// Whether this word is a valid unicode identifier.
-fn is_identifier(string: &str) -> bool {
- let mut chars = string.chars();
-
- match chars.next() {
- Some(c) if c != '.' && !UnicodeXID::is_xid_start(c) => return false,
- None => return false,
- _ => (),
- }
-
- while let Some(c) = chars.next() {
- if c != '.' && !UnicodeXID::is_xid_continue(c) {
- return false;
- }
- }
-
- true
-}
-
-/// The error type for parsing.
-pub struct ParseError(String);
-
-/// The result type for parsing.
-pub type ParseResult<T> = Result<T, ParseError>;
-
-impl ParseError {
- /// Create a new parse error with a message.
- pub fn new<S: Into<String>>(message: S) -> ParseError {
- ParseError(message.into())
- }
-}
-
-error_type! {
- err: ParseError,
- show: f => f.write_str(&err.0),
-}
-
-#[cfg(test)]
-mod tests {
- use super::*;
- use crate::func::{CommandList, Function, Scope};
- use crate::layout::{LayoutContext, LayoutResult};
- use funcs::*;
- use Node::{Func as F, Newline as N, Space as S};
-
- /// Two test functions, one which parses it's body as another syntax tree
- /// and another one which does not expect a body.
- mod funcs {
- use super::*;
-
- /// A testing function which just parses it's body into a syntax tree.
- #[derive(Debug, PartialEq)]
- pub struct TreeFn(pub SyntaxTree);
-
- impl Function for TreeFn {
- fn parse(_: &FuncHeader, body: Option<&str>, ctx: ParseContext) -> ParseResult<Self>
- where Self: Sized {
- if let Some(src) = body {
- parse(src, ctx).map(|tree| TreeFn(tree))
- } else {
- Err(ParseError::new("expected body for tree fn"))
- }
- }
-
- fn layout(&self, _: LayoutContext) -> LayoutResult<CommandList> {
- Ok(CommandList::new())
- }
- }
-
- /// A testing function without a body.
- #[derive(Debug, PartialEq)]
- pub struct BodylessFn;
-
- impl Function for BodylessFn {
- fn parse(_: &FuncHeader, body: Option<&str>, _: ParseContext) -> ParseResult<Self>
- where Self: Sized {
- if body.is_none() {
- Ok(BodylessFn)
- } else {
- Err(ParseError::new("unexpected body for bodyless fn"))
- }
- }
-
- fn layout(&self, _: LayoutContext) -> LayoutResult<CommandList> {
- Ok(CommandList::new())
- }
- }
- }
-
- /// Test if the source code parses into the syntax tree.
- fn test(src: &str, tree: SyntaxTree) {
- let ctx = ParseContext {
- scope: &Scope::new(),
- };
- assert_eq!(parse(src, ctx).unwrap(), tree);
- }
-
- /// Test with a scope containing function definitions.
- fn test_scoped(scope: &Scope, src: &str, tree: SyntaxTree) {
- let ctx = ParseContext { scope };
- assert_eq!(parse(src, ctx).unwrap(), tree);
- }
-
- /// Test if the source parses into the error.
- fn test_err(src: &str, err: &str) {
- let ctx = ParseContext {
- scope: &Scope::new(),
- };
- assert_eq!(parse(src, ctx).unwrap_err().to_string(), err);
- }
-
- /// Test with a scope if the source parses into the error.
- fn test_err_scoped(scope: &Scope, src: &str, err: &str) {
- let ctx = ParseContext { scope };
- assert_eq!(parse(src, ctx).unwrap_err().to_string(), err);
- }
-
- /// Create a text node.
- #[allow(non_snake_case)]
- fn T(s: &str) -> Node {
- Node::Text(s.to_owned())
- }
-
- /// Shortcut macro to create a syntax tree. Is `vec`-like and the elements
- /// are the nodes.
- macro_rules! tree {
- ($($x:expr),*) => (
- SyntaxTree { nodes: vec![$($x),*] }
- );
- ($($x:expr,)*) => (tree![$($x),*])
- }
-
- /// Shortcut macro to create a function.
- macro_rules! func {
- (name => $name:expr, body => None $(,)*) => {
- func!(@$name, Box::new(BodylessFn))
- };
- (name => $name:expr, body => $tree:expr $(,)*) => {
- func!(@$name, Box::new(TreeFn($tree)))
- };
- (@$name:expr, $body:expr) => {
- FuncCall {
- header: FuncHeader {
- name: $name.to_string(),
- args: vec![],
- kwargs: HashMap::new(),
- },
- body: $body,
- }
- }
- }
-
- /// Parse the basic cases.
- #[test]
- #[rustfmt::skip]
- fn parse_base() {
- test("", tree! []);
- test("Hello World!", tree! [ T("Hello"), S, T("World!") ]);
- }
-
- /// Test whether newlines generate the correct whitespace.
- #[test]
- #[rustfmt::skip]
- fn parse_newlines_whitespace() {
- test("Hello\nWorld", tree! [ T("Hello"), S, T("World") ]);
- test("Hello \n World", tree! [ T("Hello"), S, T("World") ]);
- test("Hello\n\nWorld", tree! [ T("Hello"), N, T("World") ]);
- test("Hello \n\nWorld", tree! [ T("Hello"), S, N, T("World") ]);
- test("Hello\n\n World", tree! [ T("Hello"), N, S, T("World") ]);
- test("Hello \n \n \n World", tree! [ T("Hello"), S, N, S, T("World") ]);
- test("Hello\n \n\n World", tree! [ T("Hello"), S, N, S, T("World") ]);
- }
-
- /// Parse things dealing with functions.
- #[test]
- #[rustfmt::skip]
- fn parse_functions() {
- let mut scope = Scope::new();
- scope.add::<BodylessFn>("test");
- scope.add::<BodylessFn>("end");
- scope.add::<TreeFn>("modifier");
- scope.add::<TreeFn>("func");
-
- test_scoped(&scope,"[test]", tree! [ F(func! { name => "test", body => None }) ]);
- test_scoped(&scope,"[ test]", tree! [ F(func! { name => "test", body => None }) ]);
- test_scoped(&scope, "This is an [modifier][example] of a function invocation.", tree! [
- T("This"), S, T("is"), S, T("an"), S,
- F(func! { name => "modifier", body => tree! [ T("example") ] }), S,
- T("of"), S, T("a"), S, T("function"), S, T("invocation.")
- ]);
- test_scoped(&scope, "[func][Hello][modifier][Here][end]", tree! [
- F(func! { name => "func", body => tree! [ T("Hello") ] }),
- F(func! { name => "modifier", body => tree! [ T("Here") ] }),
- F(func! { name => "end", body => None }),
- ]);
- test_scoped(&scope, "[func][]", tree! [ F(func! { name => "func", body => tree! [] }) ]);
- test_scoped(&scope, "[modifier][[func][call]] outside", tree! [
- F(func! {
- name => "modifier",
- body => tree! [ F(func! { name => "func", body => tree! [ T("call") ] }) ],
- }),
- S, T("outside")
- ]);
-
- }
-
- /// Parse functions with arguments.
- #[test]
- #[rustfmt::skip]
- fn parse_function_args() {
- use Expression::{Number as N, Size as Z, Bool as B};
-
- #[allow(non_snake_case)]
- fn S(string: &str) -> Expression { Expression::Str(string.to_owned()) }
- #[allow(non_snake_case)]
- fn I(string: &str) -> Expression { Expression::Ident(string.to_owned()) }
-
- fn func(name: &str, args: Vec<Expression>) -> SyntaxTree {
- tree! [
- F(FuncCall {
- header: FuncHeader {
- name: name.to_string(),
- args,
- kwargs: HashMap::new(),
- },
- body: Box::new(BodylessFn)
- })
- ]
- }
-
- let mut scope = Scope::new();
- scope.add::<BodylessFn>("align");
-
- test_scoped(&scope, "[align: left]", func("align", vec![I("left")]));
- test_scoped(&scope, "[align: left,right]", func("align", vec![I("left"), I("right")]));
- test_scoped(&scope, "[align: left, right]", func("align", vec![I("left"), I("right")]));
- test_scoped(&scope, "[align: \"hello\"]", func("align", vec![S("hello")]));
- test_scoped(&scope, r#"[align: "hello\"world"]"#, func("align", vec![S(r#"hello\"world"#)]));
- test_scoped(&scope, "[align: 12]", func("align", vec![N(12.0)]));
- test_scoped(&scope, "[align: 17.53pt]", func("align", vec![Z(Size::pt(17.53))]));
- test_scoped(&scope, "[align: 2.4in]", func("align", vec![Z(Size::inches(2.4))]));
- test_scoped(&scope, "[align: true, 10mm, left, \"hi, there\"]",
- func("align", vec![B(true), Z(Size::mm(10.0)), I("left"), S("hi, there")]));
- }
-
- /// Parse comments (line and block).
- #[test]
- #[rustfmt::skip]
- fn parse_comments() {
- let mut scope = Scope::new();
- scope.add::<BodylessFn>("test");
- scope.add::<TreeFn>("func");
-
- test_scoped(&scope, "Text\n// Comment\n More text",
- tree! [ T("Text"), S, T("More"), S, T("text") ]);
- test_scoped(&scope, "[test/*world*/]",
- tree! [ F(func! { name => "test", body => None }) ]);
- test_scoped(&scope, "[test/*]*/]",
- tree! [ F(func! { name => "test", body => None }) ]);
- }
-
- /// Test if escaped, but unbalanced parens are correctly parsed.
- #[test]
- #[rustfmt::skip]
- fn parse_unbalanced_body_parens() {
- let mut scope = Scope::new();
- scope.add::<TreeFn>("code");
-
- test_scoped(&scope, r"My [code][Close \]] end", tree! [
- T("My"), S, F(func! {
- name => "code",
- body => tree! [ T("Close"), S, T("]") ]
- }), S, T("end")
- ]);
- test_scoped(&scope, r"My [code][\[ Open] end", tree! [
- T("My"), S, F(func! {
- name => "code",
- body => tree! [ T("["), S, T("Open") ]
- }), S, T("end")
- ]);
- test_scoped(&scope, r"My [code][Open \] and \[ close]end", tree! [
- T("My"), S, F(func! {
- name => "code",
- body => tree! [ T("Open"), S, T("]"), S, T("and"), S, T("["), S, T("close") ]
- }), T("end")
- ]);
- }
-
- /// Tests if the parser handles non-ASCII stuff correctly.
- #[test]
- #[rustfmt::skip]
- fn parse_unicode() {
- let mut scope = Scope::new();
- scope.add::<BodylessFn>("func");
- scope.add::<TreeFn>("bold");
-
- test_scoped(&scope, "[func] ⺐.", tree! [
- F(func! {
- name => "func",
- body => None,
- }),
- S, T("⺐.")
- ]);
- test_scoped(&scope, "[bold][Hello 🌍!]", tree! [
- F(func! {
- name => "bold",
- body => tree! [ T("Hello"), S, T("🌍!") ],
- })
- ]);
- }
-
- /// Tests whether errors get reported correctly.
- #[test]
- #[rustfmt::skip]
- fn parse_errors() {
- let mut scope = Scope::new();
- scope.add::<TreeFn>("hello");
-
- test_err("No functions here]", "unexpected closing bracket");
- test_err_scoped(&scope, "[hello][world", "expected closing bracket");
- test_err("[hello world", "expected function arguments or closing bracket");
- test_err("[ no-name][Why?]", "invalid identifier: 'no-name'");
- test_err("Hello */", "unexpected end of block comment");
- }
-}