diff options
| author | Laurenz <laurmaedje@gmail.com> | 2019-10-11 17:53:28 +0200 |
|---|---|---|
| committer | Laurenz <laurmaedje@gmail.com> | 2019-10-11 17:53:28 +0200 |
| commit | c0e4fd55e6fa738cfc5dcc851d0fc3ee2d0f2cd2 (patch) | |
| tree | e5531e605d0ab9e06dc950b4cd9b7a8caa116d34 /src/parsing/mod.rs | |
| parent | 8f788f9a4f5e970bbe6147987b711470d57aca8d (diff) | |
Create test runner which renders layouts to images πΊ
Diffstat (limited to 'src/parsing/mod.rs')
| -rw-r--r-- | src/parsing/mod.rs | 698 |
1 files changed, 698 insertions, 0 deletions
diff --git a/src/parsing/mod.rs b/src/parsing/mod.rs new file mode 100644 index 00000000..344f3577 --- /dev/null +++ b/src/parsing/mod.rs @@ -0,0 +1,698 @@ +//! Parsing of source code into token streams an syntax trees. + +use std::collections::HashMap; +use unicode_xid::UnicodeXID; + +use crate::func::{Function, Scope}; +use crate::syntax::*; +use crate::size::Size; + +mod tokens; +pub use tokens::{tokenize, Tokens}; + + +/// Parses source code into a syntax tree given a context. +#[inline] +pub fn parse(src: &str, ctx: ParseContext) -> ParseResult<SyntaxTree> { + Parser::new(src, ctx).parse() +} + +/// The context for parsing. +#[derive(Debug, Copy, Clone)] +pub struct ParseContext<'a> { + /// The scope containing function definitions. + pub scope: &'a Scope, +} + +/// Transforms token streams to syntax trees. +#[derive(Debug)] +struct Parser<'s> { + src: &'s str, + tokens: PeekableTokens<'s>, + state: ParserState, + ctx: ParseContext<'s>, + tree: SyntaxTree, +} + +/// The state the parser is in. +#[derive(Debug, Copy, Clone, Eq, PartialEq)] +enum ParserState { + /// The base state of the parser. + Body, + /// We saw one newline already. + FirstNewline, + /// We wrote a newline. + WroteNewline, +} + +impl<'s> Parser<'s> { + /// Create a new parser from the source and the context. + fn new(src: &'s str, ctx: ParseContext<'s>) -> Parser<'s> { + Parser { + src, + tokens: PeekableTokens::new(tokenize(src)), + state: ParserState::Body, + ctx, + tree: SyntaxTree::new(), + } + } + + /// Parse the source into an abstract syntax tree. + fn parse(mut self) -> ParseResult<SyntaxTree> { + // Loop through all the tokens. + while self.tokens.peek().is_some() { + self.parse_white()?; + self.parse_body_part()?; + } + + Ok(self.tree) + } + + /// Parse the next part of the body. + fn parse_body_part(&mut self) -> ParseResult<()> { + if let Some(token) = self.tokens.peek() { + match token { + // Functions + Token::LeftBracket => self.parse_func()?, + Token::RightBracket => return Err(ParseError::new("unexpected closing bracket")), + + // Modifiers + Token::Underscore => self.append_consumed(Node::ToggleItalics), + Token::Star => self.append_consumed(Node::ToggleBold), + Token::Backtick => self.append_consumed(Node::ToggleMonospace), + + // Normal text + Token::Text(word) => self.append_consumed(Node::Text(word.to_owned())), + + Token::Colon | Token::Equals => panic!("bad token for body: {:?}", token), + + // The rest is handled elsewhere or should not happen, because `Tokens` does not + // yield colons or equals in the body, but their text equivalents instead. + _ => panic!("unexpected token: {:?}", token), + } + } + Ok(()) + } + + /// Parse a complete function from the current position. + fn parse_func(&mut self) -> ParseResult<()> { + // This should only be called if a left bracket was seen. + assert!(self.tokens.next() == Some(Token::LeftBracket)); + + let header = self.parse_func_header()?; + let body = self.parse_func_body(&header)?; + + // Finally this function is parsed to the end. + self.append(Node::Func(FuncCall { + header, + body, + })); + + Ok(self.switch(ParserState::Body)) + } + + /// Parse a function header. + fn parse_func_header(&mut self) -> ParseResult<FuncHeader> { + // The next token should be the name of the function. + self.skip_white(); + let name = match self.tokens.next() { + Some(Token::Text(word)) => { + if is_identifier(word) { + Ok(word.to_owned()) + } else { + Err(ParseError::new(format!("invalid identifier: '{}'", word))) + } + }, + _ => Err(ParseError::new("expected identifier")), + }?; + + let mut header = FuncHeader { + name, + args: vec![], + kwargs: HashMap::new(), + }; + + self.skip_white(); + + // Check for arguments + match self.tokens.next() { + Some(Token::RightBracket) => {}, + Some(Token::Colon) => { + let (args, kwargs) = self.parse_func_args()?; + header.args = args; + header.kwargs = kwargs; + }, + _ => return Err(ParseError::new("expected function arguments or closing bracket")), + } + + // Store the header information of the function invocation. + Ok(header) + } + + /// Parse the arguments to a function. + fn parse_func_args(&mut self) -> ParseResult<(Vec<Expression>, HashMap<String, Expression>)> { + let mut args = vec![]; + let kwargs = HashMap::new(); + + let mut comma = false; + loop { + self.skip_white(); + + match self.tokens.peek() { + Some(Token::Text(_)) | Some(Token::Quoted(_)) if !comma => { + args.push(self.parse_expression()?); + comma = true; + }, + + Some(Token::Comma) if comma => { + self.advance(); + comma = false + }, + Some(Token::RightBracket) => { + self.advance(); + break + }, + + _ if comma => return Err(ParseError::new("expected comma or closing bracket")), + _ => return Err(ParseError::new("expected closing bracket")), + } + } + + Ok((args, kwargs)) + } + + /// Parse an expression. + fn parse_expression(&mut self) -> ParseResult<Expression> { + Ok(match self.tokens.next() { + Some(Token::Quoted(text)) => Expression::Str(text.to_owned()), + Some(Token::Text(text)) => { + if let Ok(b) = text.parse::<bool>() { + Expression::Bool(b) + } else if let Ok(num) = text.parse::<f64>() { + Expression::Number(num) + } else if let Ok(size) = text.parse::<Size>() { + Expression::Size(size) + } else { + Expression::Ident(text.to_owned()) + } + }, + _ => return Err(ParseError::new("expected expression")), + }) + } + + /// Parse the body of a function. + fn parse_func_body(&mut self, header: &FuncHeader) -> ParseResult<Box<dyn Function>> { + // Whether the function has a body. + let has_body = self.tokens.peek() == Some(Token::LeftBracket); + if has_body { + self.advance(); + } + + // Now we want to parse this function dynamically. + let parser = self.ctx.scope.get_parser(&header.name) + .ok_or_else(|| ParseError::new(format!("unknown function: '{}'", &header.name)))?; + + // Do the parsing dependent on whether the function has a body. + Ok(if has_body { + // Find out the string which makes the body of this function. + let (start, end) = self.tokens.current_index().and_then(|index| { + find_closing_bracket(&self.src[index..]) + .map(|end| (index, index + end)) + }).ok_or_else(|| ParseError::new("expected closing bracket"))?; + + // Parse the body. + let body_string = &self.src[start .. end]; + let body = parser(&header, Some(body_string), self.ctx)?; + + // Skip to the end of the function in the token stream. + self.tokens.goto(end); + + // Now the body should be closed. + assert!(self.tokens.next() == Some(Token::RightBracket)); + + body + } else { + parser(&header, None, self.ctx)? + }) + } + + /// Parse whitespace (as long as there is any) and skip over comments. + fn parse_white(&mut self) -> ParseResult<()> { + while let Some(token) = self.tokens.peek() { + match self.state { + ParserState::FirstNewline => match token { + Token::Newline => { + self.append_consumed(Node::Newline); + self.switch(ParserState::WroteNewline); + }, + Token::Space => self.append_space_consumed(), + _ => { + self.append_space(); + self.switch(ParserState::Body); + }, + }, + ParserState::WroteNewline => match token { + Token::Newline | Token::Space => self.append_space_consumed(), + _ => self.switch(ParserState::Body), + }, + ParserState::Body => match token { + // Whitespace + Token::Space => self.append_space_consumed(), + Token::Newline => { + self.advance(); + self.switch(ParserState::FirstNewline); + }, + + // Comments + Token::LineComment(_) | Token::BlockComment(_) => self.advance(), + Token::StarSlash => { + return Err(ParseError::new("unexpected end of block comment")); + }, + + // Anything else skips out of the function. + _ => break, + } + } + } + + Ok(()) + } + + /// Skip over whitespace and comments. + fn skip_white(&mut self) { + while let Some(token) = self.tokens.peek() { + match token { + Token::Space | Token::Newline + | Token::LineComment(_) | Token::BlockComment(_) => self.advance(), + _ => break, + } + } + } + + /// Advance the iterator by one step. + fn advance(&mut self) { + self.tokens.next(); + } + + /// Switch the state. + fn switch(&mut self, state: ParserState) { + self.state = state; + } + + /// Append a node to the tree. + fn append(&mut self, node: Node) { + self.tree.nodes.push(node); + } + + /// Append a space if there is not one already. + fn append_space(&mut self) { + if self.tree.nodes.last() != Some(&Node::Space) { + self.append(Node::Space); + } + } + + /// Advance and return the given node. + fn append_consumed(&mut self, node: Node) { + self.advance(); + self.append(node); + } + + /// Advance and append a space if there is not one already. + fn append_space_consumed(&mut self) { + self.advance(); + self.append_space(); + } +} + +/// Find the index of the first unbalanced and unescaped closing bracket. +fn find_closing_bracket(src: &str) -> Option<usize> { + let mut parens = 0; + let mut escaped = false; + for (index, c) in src.char_indices() { + match c { + '\\' => { + escaped = !escaped; + continue; + }, + ']' if !escaped && parens == 0 => return Some(index), + '[' if !escaped => parens += 1, + ']' if !escaped => parens -= 1, + _ => {}, + } + escaped = false; + } + None +} + +/// A peekable iterator for tokens which allows access to the original iterator inside this module +/// (which is needed by the parser). +#[derive(Debug, Clone)] +struct PeekableTokens<'s> { + tokens: Tokens<'s>, + peeked: Option<Option<Token<'s>>>, +} + +impl<'s> PeekableTokens<'s> { + /// Create a new iterator from a string. + fn new(tokens: Tokens<'s>) -> PeekableTokens<'s> { + PeekableTokens { + tokens, + peeked: None, + } + } + + /// Peek at the next element. + fn peek(&mut self) -> Option<Token<'s>> { + let iter = &mut self.tokens; + *self.peeked.get_or_insert_with(|| iter.next()) + } + + /// The index of the first character of the next token in the source string. + fn current_index(&mut self) -> Option<usize> { + self.tokens.chars.current_index() + } + + /// Go to a new position in the underlying string. + fn goto(&mut self, index: usize) { + self.tokens.chars.goto(index); + self.peeked = None; + } +} + +impl<'s> Iterator for PeekableTokens<'s> { + type Item = Token<'s>; + + fn next(&mut self) -> Option<Token<'s>> { + match self.peeked.take() { + Some(value) => value, + None => self.tokens.next(), + } + } +} + +/// Whether this word is a valid unicode identifier. +fn is_identifier(string: &str) -> bool { + let mut chars = string.chars(); + + match chars.next() { + Some(c) if !UnicodeXID::is_xid_start(c) => return false, + None => return false, + _ => (), + } + + while let Some(c) = chars.next() { + if !UnicodeXID::is_xid_continue(c) { + return false; + } + } + + true +} + + +/// The error type for parsing. +pub struct ParseError(String); + +/// The result type for parsing. +pub type ParseResult<T> = Result<T, ParseError>; + +impl ParseError { + /// Create a new parse error with a message. + pub fn new<S: Into<String>>(message: S) -> ParseError { + ParseError(message.into()) + } +} + +error_type! { + err: ParseError, + show: f => f.write_str(&err.0), +} + + +#[cfg(test)] +mod tests { + use super::*; + use crate::func::{Function, Scope}; + use crate::layout::{LayoutContext, LayoutResult, Layout}; + use Node::{Space as S, Newline as N, Func as F}; + use funcs::*; + + /// Two test functions, one which parses it's body as another syntax tree and another one which + /// does not expect a body. + mod funcs { + use super::*; + + /// A testing function which just parses it's body into a syntax tree. + #[derive(Debug, PartialEq)] + pub struct TreeFn(pub SyntaxTree); + + impl Function for TreeFn { + fn parse(_: &FuncHeader, body: Option<&str>, ctx: ParseContext) + -> ParseResult<Self> where Self: Sized { + if let Some(src) = body { + parse(src, ctx).map(|tree| TreeFn(tree)) + } else { + Err(ParseError::new("expected body for tree fn")) + } + } + + fn layout(&self, _: LayoutContext) -> LayoutResult<Option<Layout>> { Ok(None) } + } + + /// A testing function without a body. + #[derive(Debug, PartialEq)] + pub struct BodylessFn; + + impl Function for BodylessFn { + fn parse(_: &FuncHeader, body: Option<&str>, _: ParseContext) + -> ParseResult<Self> where Self: Sized { + if body.is_none() { + Ok(BodylessFn) + } else { + Err(ParseError::new("unexpected body for bodyless fn")) + } + } + + fn layout(&self, _: LayoutContext) -> LayoutResult<Option<Layout>> { Ok(None) } + } + } + + /// Test if the source code parses into the syntax tree. + fn test(src: &str, tree: SyntaxTree) { + let ctx = ParseContext { scope: &Scope::new() }; + assert_eq!(parse(src, ctx).unwrap(), tree); + } + + /// Test with a scope containing function definitions. + fn test_scoped(scope: &Scope, src: &str, tree: SyntaxTree) { + let ctx = ParseContext { scope }; + assert_eq!(parse(src, ctx).unwrap(), tree); + } + + /// Test if the source parses into the error. + fn test_err(src: &str, err: &str) { + let ctx = ParseContext { scope: &Scope::new() }; + assert_eq!(parse(src, ctx).unwrap_err().to_string(), err); + } + + /// Test with a scope if the source parses into the error. + fn test_err_scoped(scope: &Scope, src: &str, err: &str) { + let ctx = ParseContext { scope }; + assert_eq!(parse(src, ctx).unwrap_err().to_string(), err); + } + + /// Create a text node. + #[allow(non_snake_case)] + fn T(s: &str) -> Node { Node::Text(s.to_owned()) } + + /// Shortcut macro to create a syntax tree. Is `vec`-like and the elements are the nodes. + macro_rules! tree { + ($($x:expr),*) => ( + SyntaxTree { nodes: vec![$($x),*] } + ); + ($($x:expr,)*) => (tree![$($x),*]) + } + + /// Shortcut macro to create a function. + macro_rules! func { + (name => $name:expr, body => None $(,)*) => { + func!(@$name, Box::new(BodylessFn)) + }; + (name => $name:expr, body => $tree:expr $(,)*) => { + func!(@$name, Box::new(TreeFn($tree))) + }; + (@$name:expr, $body:expr) => { + FuncCall { + header: FuncHeader { + name: $name.to_string(), + args: vec![], + kwargs: HashMap::new(), + }, + body: $body, + } + } + } + + /// Parse the basic cases. + #[test] + fn parse_base() { + test("", tree! []); + test("Hello World!", tree! [ T("Hello"), S, T("World!") ]); + } + + /// Test whether newlines generate the correct whitespace. + #[test] + fn parse_newlines_whitespace() { + test("Hello\nWorld", tree! [ T("Hello"), S, T("World") ]); + test("Hello \n World", tree! [ T("Hello"), S, T("World") ]); + test("Hello\n\nWorld", tree! [ T("Hello"), N, T("World") ]); + test("Hello \n\nWorld", tree! [ T("Hello"), S, N, T("World") ]); + test("Hello\n\n World", tree! [ T("Hello"), N, S, T("World") ]); + test("Hello \n \n \n World", tree! [ T("Hello"), S, N, S, T("World") ]); + test("Hello\n \n\n World", tree! [ T("Hello"), S, N, S, T("World") ]); + } + + /// Parse things dealing with functions. + #[test] + fn parse_functions() { + let mut scope = Scope::new(); + scope.add::<BodylessFn>("test"); + scope.add::<BodylessFn>("end"); + scope.add::<TreeFn>("modifier"); + scope.add::<TreeFn>("func"); + + test_scoped(&scope,"[test]", tree! [ F(func! { name => "test", body => None }) ]); + test_scoped(&scope,"[ test]", tree! [ F(func! { name => "test", body => None }) ]); + test_scoped(&scope, "This is an [modifier][example] of a function invocation.", tree! [ + T("This"), S, T("is"), S, T("an"), S, + F(func! { name => "modifier", body => tree! [ T("example") ] }), S, + T("of"), S, T("a"), S, T("function"), S, T("invocation.") + ]); + test_scoped(&scope, "[func][Hello][modifier][Here][end]", tree! [ + F(func! { name => "func", body => tree! [ T("Hello") ] }), + F(func! { name => "modifier", body => tree! [ T("Here") ] }), + F(func! { name => "end", body => None }), + ]); + test_scoped(&scope, "[func][]", tree! [ F(func! { name => "func", body => tree! [] }) ]); + test_scoped(&scope, "[modifier][[func][call]] outside", tree! [ + F(func! { + name => "modifier", + body => tree! [ F(func! { name => "func", body => tree! [ T("call") ] }) ], + }), + S, T("outside") + ]); + + } + + /// Parse functions with arguments. + #[test] + fn parse_function_args() { + use Expression::{Number as N, Size as Z, Bool as B}; + + #[allow(non_snake_case)] + fn S(string: &str) -> Expression { Expression::Str(string.to_owned()) } + #[allow(non_snake_case)] + fn I(string: &str) -> Expression { Expression::Ident(string.to_owned()) } + + fn func(name: &str, args: Vec<Expression>) -> SyntaxTree { + tree! [ F(FuncCall { + header: FuncHeader { + name: name.to_string(), + args, + kwargs: HashMap::new(), + }, + body: Box::new(BodylessFn) + }) ] + } + + let mut scope = Scope::new(); + scope.add::<BodylessFn>("align"); + + test_scoped(&scope, "[align: left]", func("align", vec![I("left")])); + test_scoped(&scope, "[align: left,right]", func("align", vec![I("left"), I("right")])); + test_scoped(&scope, "[align: left, right]", func("align", vec![I("left"), I("right")])); + test_scoped(&scope, "[align: \"hello\"]", func("align", vec![S("hello")])); + test_scoped(&scope, r#"[align: "hello\"world"]"#, func("align", vec![S(r#"hello\"world"#)])); + test_scoped(&scope, "[align: 12]", func("align", vec![N(12.0)])); + test_scoped(&scope, "[align: 17.53pt]", func("align", vec![Z(Size::pt(17.53))])); + test_scoped(&scope, "[align: 2.4in]", func("align", vec![Z(Size::inches(2.4))])); + test_scoped(&scope, "[align: true, 10mm, left, \"hi, there\"]", + func("align", vec![B(true), Z(Size::mm(10.0)), I("left"), S("hi, there")])); + } + + /// Parse comments (line and block). + #[test] + fn parse_comments() { + let mut scope = Scope::new(); + scope.add::<BodylessFn>("test"); + scope.add::<TreeFn>("func"); + + test_scoped(&scope, "Text\n// Comment\n More text", + tree! [ T("Text"), S, T("More"), S, T("text") ]); + test_scoped(&scope, "[test/*world*/]", + tree! [ F(func! { name => "test", body => None }) ]); + test_scoped(&scope, "[test/*]*/]", + tree! [ F(func! { name => "test", body => None }) ]); + } + + /// Test if escaped, but unbalanced parens are correctly parsed. + #[test] + fn parse_unbalanced_body_parens() { + let mut scope = Scope::new(); + scope.add::<TreeFn>("code"); + + test_scoped(&scope, r"My [code][Close \]] end", tree! [ + T("My"), S, F(func! { + name => "code", + body => tree! [ T("Close"), S, T("]") ] + }), S, T("end") + ]); + test_scoped(&scope, r"My [code][\[ Open] end", tree! [ + T("My"), S, F(func! { + name => "code", + body => tree! [ T("["), S, T("Open") ] + }), S, T("end") + ]); + test_scoped(&scope, r"My [code][Open \] and \[ close]end", tree! [ + T("My"), S, F(func! { + name => "code", + body => tree! [ T("Open"), S, T("]"), S, T("and"), S, T("["), S, T("close") ] + }), T("end") + ]); + } + + /// Tests if the parser handles non-ASCII stuff correctly. + #[test] + fn parse_unicode() { + let mut scope = Scope::new(); + scope.add::<BodylessFn>("func"); + scope.add::<TreeFn>("bold"); + + test_scoped(&scope, "[func] βΊ.", tree! [ + F(func! { + name => "func", + body => None, + }), + S, T("βΊ.") + ]); + test_scoped(&scope, "[bold][Hello π!]", tree! [ + F(func! { + name => "bold", + body => tree! [ T("Hello"), S, T("π!") ], + }) + ]); + } + + /// Tests whether errors get reported correctly. + #[test] + fn parse_errors() { + let mut scope = Scope::new(); + scope.add::<TreeFn>("hello"); + + test_err("No functions here]", "unexpected closing bracket"); + test_err_scoped(&scope, "[hello][world", "expected closing bracket"); + test_err("[hello world", "expected function arguments or closing bracket"); + test_err("[ no-name][Why?]", "invalid identifier: 'no-name'"); + test_err("Hello */", "unexpected end of block comment"); + } +} |
