summaryrefslogtreecommitdiff
path: root/src/parse
diff options
context:
space:
mode:
authorLaurenz <laurmaedje@gmail.com>2021-06-09 00:37:13 +0200
committerLaurenz <laurmaedje@gmail.com>2021-06-09 00:37:13 +0200
commit5afb42ad89abb518a01a09051f0f9b6f75bd383e (patch)
treeb12368a287f22de711df8d759c20ee742ed5b4c2 /src/parse
parentd69dfa84ec957ac4037f60a3335416a9f73b97c8 (diff)
Lists with indent-based parsing
- Unordered lists with indent-based parsing and basic layout using stacks - Headings are now also indent-based - Removes syntax functions since they will be superseded by select & transform
Diffstat (limited to 'src/parse')
-rw-r--r--src/parse/lines.rs28
-rw-r--r--src/parse/mod.rs147
-rw-r--r--src/parse/parser.rs124
-rw-r--r--src/parse/scanner.rs2
-rw-r--r--src/parse/tokens.rs508
5 files changed, 439 insertions, 370 deletions
diff --git a/src/parse/lines.rs b/src/parse/lines.rs
index 8693af44..bbdedaa5 100644
--- a/src/parse/lines.rs
+++ b/src/parse/lines.rs
@@ -32,6 +32,8 @@ impl<'s> LineMap<'s> {
let start = self.line_starts.get(line_index)?;
let head = self.src.get(start.to_usize() .. pos.to_usize())?;
+
+ // TODO: What about tabs?
let column_index = head.chars().count();
Some(Location {
@@ -52,12 +54,14 @@ impl<'s> LineMap<'s> {
let line = self.src.get(line_start.to_usize() .. line_end)?;
- // Find the index in the line. For the first column, the index is always zero. For
- // other columns, we have to look at which byte the char directly before the
- // column in question ends. We can't do `nth(column_idx)` directly since the
- // column may be behind the last char.
+ // Find the index in the line. For the first column, the index is always
+ // zero. For other columns, we have to look at which byte the char
+ // directly before the column in question ends. We can't do
+ // `nth(column_idx)` directly since the column may be behind the last
+ // char.
let column_idx = location.column.checked_sub(1)? as usize;
let line_offset = if let Some(prev_idx) = column_idx.checked_sub(1) {
+ // TODO: What about tabs?
let (idx, prev) = line.char_indices().nth(prev_idx)?;
idx + prev.len_utf8()
} else {
@@ -68,6 +72,22 @@ impl<'s> LineMap<'s> {
}
}
+/// Determine the column at the end of the string.
+pub fn search_column(src: &str) -> usize {
+ let mut column = 0;
+ for c in src.chars().rev() {
+ if is_newline(c) {
+ break;
+ } else if c == '\t' {
+ // TODO: How many columns per tab?
+ column += 2;
+ } else {
+ column += 1;
+ }
+ }
+ column
+}
+
/// Whether this character denotes a newline.
pub fn is_newline(character: char) -> bool {
matches!(
diff --git a/src/parse/mod.rs b/src/parse/mod.rs
index 1b32b31e..048bcb1c 100644
--- a/src/parse/mod.rs
+++ b/src/parse/mod.rs
@@ -25,14 +25,32 @@ pub fn parse(src: &str) -> Pass<Tree> {
/// Parse a syntax tree.
fn tree(p: &mut Parser) -> Tree {
+ tree_while(p, |_| true)
+}
+
+/// Parse a syntax tree that stays right of the column at the start of the next
+/// non-whitespace token.
+fn tree_indented(p: &mut Parser) -> Tree {
+ p.skip_white();
+ let column = p.column(p.next_start());
+ tree_while(p, |p| match p.peek() {
+ Some(Token::Space(n)) if n >= 1 => p.column(p.next_end()) >= column,
+ _ => true,
+ })
+}
+
+/// Parse a syntax tree.
+fn tree_while(p: &mut Parser, mut f: impl FnMut(&mut Parser) -> bool) -> Tree {
// We keep track of whether we are at the start of a block or paragraph
- // to know whether headings are allowed.
+ // to know whether things like headings are allowed.
let mut at_start = true;
let mut tree = vec![];
- while !p.eof() {
+ while !p.eof() && f(p) {
if let Some(node) = node(p, &mut at_start) {
- if !matches!(node, Node::Parbreak(_) | Node::Space) {
- at_start = false;
+ match node {
+ Node::Space => {}
+ Node::Parbreak(_) => {}
+ _ => at_start = false,
}
tree.push(node);
}
@@ -57,10 +75,16 @@ fn node(p: &mut Parser, at_start: &mut bool) -> Option<Node> {
// Text.
Token::Text(text) => Node::Text(text.into()),
+ Token::Tilde => Node::Text("\u{00A0}".into()),
+ Token::HyphHyph => Node::Text("\u{2013}".into()),
+ Token::HyphHyphHyph => Node::Text("\u{2014}".into()),
+ Token::UnicodeEscape(t) => Node::Text(unicode_escape(p, t)),
// Markup.
+ Token::Backslash => Node::Linebreak(span),
Token::Star => Node::Strong(span),
Token::Underscore => Node::Emph(span),
+ Token::Raw(t) => raw(p, t),
Token::Hashtag => {
if *at_start {
return Some(heading(p));
@@ -68,10 +92,13 @@ fn node(p: &mut Parser, at_start: &mut bool) -> Option<Node> {
Node::Text(p.peek_src().into())
}
}
- Token::Tilde => Node::Text("\u{00A0}".into()),
- Token::Backslash => Node::Linebreak(span),
- Token::Raw(t) => raw(p, t),
- Token::UnicodeEscape(t) => Node::Text(unicode_escape(p, t)),
+ Token::Hyph => {
+ if *at_start {
+ return Some(list(p));
+ } else {
+ Node::Text(p.peek_src().into())
+ }
+ }
// Hashtag + keyword / identifier.
Token::Ident(_)
@@ -81,31 +108,27 @@ fn node(p: &mut Parser, at_start: &mut bool) -> Option<Node> {
| Token::For
| Token::Import
| Token::Include => {
- *at_start = false;
let stmt = matches!(token, Token::Let | Token::Import);
let group = if stmt { Group::Stmt } else { Group::Expr };
p.start_group(group, TokenMode::Code);
let expr = expr_with(p, true, 0);
if stmt && expr.is_some() && !p.eof() {
- p.expected_at("semicolon or line break", p.end());
+ p.expected_at("semicolon or line break", p.prev_end());
}
p.end_group();
// Uneat spaces we might have eaten eagerly.
- p.jump(p.end());
return expr.map(Node::Expr);
}
// Block.
Token::LeftBrace => {
- *at_start = false;
return Some(Node::Expr(block(p, false)));
}
// Template.
Token::LeftBracket => {
- *at_start = false;
return Some(Node::Expr(template(p)));
}
@@ -125,9 +148,37 @@ fn node(p: &mut Parser, at_start: &mut bool) -> Option<Node> {
Some(node)
}
+/// Handle a unicode escape sequence.
+fn unicode_escape(p: &mut Parser, token: UnicodeEscapeToken) -> String {
+ let span = p.peek_span();
+ let text = if let Some(c) = resolve::resolve_hex(token.sequence) {
+ c.to_string()
+ } else {
+ // Print out the escape sequence verbatim if it is invalid.
+ p.diag(error!(span, "invalid unicode escape sequence"));
+ p.peek_src().into()
+ };
+
+ if !token.terminated {
+ p.diag(error!(span.end, "expected closing brace"));
+ }
+
+ text
+}
+
+/// Handle a raw block.
+fn raw(p: &mut Parser, token: RawToken) -> Node {
+ let span = p.peek_span();
+ let raw = resolve::resolve_raw(span, token.text, token.backticks);
+ if !token.terminated {
+ p.diag(error!(p.peek_span().end, "expected backtick(s)"));
+ }
+ Node::Raw(raw)
+}
+
/// Parse a heading.
fn heading(p: &mut Parser) -> Node {
- let start = p.start();
+ let start = p.next_start();
p.assert(Token::Hashtag);
// Count depth.
@@ -137,49 +188,25 @@ fn heading(p: &mut Parser) -> Node {
}
if level > 6 {
- p.diag(warning!(start .. p.end(), "should not exceed depth 6"));
+ p.diag(warning!(start .. p.prev_end(), "should not exceed depth 6"));
level = 6;
}
- // Parse the heading contents.
- let mut tree = vec![];
- while p.check(|t| !matches!(t, Token::Space(n) if n >= 1)) {
- tree.extend(node(p, &mut false));
- }
+ let body = tree_indented(p);
Node::Heading(HeadingNode {
span: p.span(start),
level,
- contents: Rc::new(tree),
+ body: Rc::new(body),
})
}
-/// Handle a raw block.
-fn raw(p: &mut Parser, token: RawToken) -> Node {
- let span = p.peek_span();
- let raw = resolve::resolve_raw(span, token.text, token.backticks);
- if !token.terminated {
- p.diag(error!(p.peek_span().end, "expected backtick(s)"));
- }
- Node::Raw(raw)
-}
-
-/// Handle a unicode escape sequence.
-fn unicode_escape(p: &mut Parser, token: UnicodeEscapeToken) -> String {
- let span = p.peek_span();
- let text = if let Some(c) = resolve::resolve_hex(token.sequence) {
- c.to_string()
- } else {
- // Print out the escape sequence verbatim if it is invalid.
- p.diag(error!(span, "invalid unicode escape sequence"));
- p.get(span).into()
- };
-
- if !token.terminated {
- p.diag(error!(span.end, "expected closing brace"));
- }
-
- text
+/// Parse a single list item.
+fn list(p: &mut Parser) -> Node {
+ let start = p.next_start();
+ p.assert(Token::Hyph);
+ let body = tree_indented(p);
+ Node::List(ListNode { span: p.span(start), body })
}
/// Parse an expression.
@@ -195,7 +222,7 @@ fn expr(p: &mut Parser) -> Option<Expr> {
///
/// Stops parsing at operations with lower precedence than `min_prec`.
fn expr_with(p: &mut Parser, atomic: bool, min_prec: usize) -> Option<Expr> {
- let start = p.start();
+ let start = p.next_start();
let mut lhs = match p.eat_map(UnOp::from_token) {
Some(op) => {
let prec = op.precedence();
@@ -383,7 +410,7 @@ fn collection(p: &mut Parser) -> (Vec<CallArg>, bool) {
break;
}
- let behind = p.end();
+ let behind = p.prev_end();
if p.eat_if(Token::Comma) {
has_comma = true;
} else {
@@ -467,7 +494,7 @@ fn block(p: &mut Parser, scoping: bool) -> Expr {
if let Some(expr) = expr(p) {
exprs.push(expr);
if !p.eof() {
- p.expected_at("semicolon or line break", p.end());
+ p.expected_at("semicolon or line break", p.prev_end());
}
}
p.end_group();
@@ -506,14 +533,14 @@ fn call(p: &mut Parser, callee: Expr) -> Expr {
/// Parse the arguments to a function call.
fn args(p: &mut Parser) -> CallArgs {
- let start = p.start();
+ let start = p.next_start();
let items = collection(p).0;
CallArgs { span: p.span(start), items }
}
/// Parse a let expression.
fn expr_let(p: &mut Parser) -> Option<Expr> {
- let start = p.start();
+ let start = p.next_start();
p.assert(Token::Let);
let mut expr_let = None;
@@ -532,7 +559,7 @@ fn expr_let(p: &mut Parser) -> Option<Expr> {
init = expr(p);
} else if params.is_some() {
// Function definitions must have a body.
- p.expected_at("body", p.end());
+ p.expected_at("body", p.prev_end());
}
// Rewrite into a closure expression if it's a function definition.
@@ -558,7 +585,7 @@ fn expr_let(p: &mut Parser) -> Option<Expr> {
/// Parse an if expression.
fn expr_if(p: &mut Parser) -> Option<Expr> {
- let start = p.start();
+ let start = p.next_start();
p.assert(Token::If);
let mut expr_if = None;
@@ -589,7 +616,7 @@ fn expr_if(p: &mut Parser) -> Option<Expr> {
/// Parse a while expression.
fn expr_while(p: &mut Parser) -> Option<Expr> {
- let start = p.start();
+ let start = p.next_start();
p.assert(Token::While);
let mut expr_while = None;
@@ -608,7 +635,7 @@ fn expr_while(p: &mut Parser) -> Option<Expr> {
/// Parse a for expression.
fn expr_for(p: &mut Parser) -> Option<Expr> {
- let start = p.start();
+ let start = p.next_start();
p.assert(Token::For);
let mut expr_for = None;
@@ -643,7 +670,7 @@ fn for_pattern(p: &mut Parser) -> Option<ForPattern> {
/// Parse an import expression.
fn expr_import(p: &mut Parser) -> Option<Expr> {
- let start = p.start();
+ let start = p.next_start();
p.assert(Token::Import);
let mut expr_import = None;
@@ -657,7 +684,7 @@ fn expr_import(p: &mut Parser) -> Option<Expr> {
p.start_group(Group::Expr, TokenMode::Code);
let items = collection(p).0;
if items.is_empty() {
- p.expected_at("import items", p.end());
+ p.expected_at("import items", p.prev_end());
}
let idents = idents(p, items);
@@ -680,7 +707,7 @@ fn expr_import(p: &mut Parser) -> Option<Expr> {
/// Parse an include expression.
fn expr_include(p: &mut Parser) -> Option<Expr> {
- let start = p.start();
+ let start = p.next_start();
p.assert(Token::Include);
expr(p).map(|path| {
@@ -710,7 +737,7 @@ fn body(p: &mut Parser) -> Option<Expr> {
Some(Token::LeftBracket) => Some(template(p)),
Some(Token::LeftBrace) => Some(block(p, true)),
_ => {
- p.expected_at("body", p.end());
+ p.expected_at("body", p.prev_end());
None
}
}
diff --git a/src/parse/parser.rs b/src/parse/parser.rs
index 6269ad73..27346587 100644
--- a/src/parse/parser.rs
+++ b/src/parse/parser.rs
@@ -1,6 +1,7 @@
use std::fmt::{self, Debug, Formatter};
+use std::ops::Range;
-use super::{Scanner, TokenMode, Tokens};
+use super::{search_column, TokenMode, Tokens};
use crate::diag::{Diag, DiagSet};
use crate::syntax::{Pos, Span, Token};
@@ -17,10 +18,10 @@ pub struct Parser<'s> {
/// The peeked token.
/// (Same as `next` except if we are at the end of group, then `None`).
peeked: Option<Token<'s>>,
- /// The start position of the peeked token.
- next_start: Pos,
/// The end position of the last (non-whitespace if in code mode) token.
- last_end: Pos,
+ prev_end: usize,
+ /// The start position of the peeked token.
+ next_start: usize,
}
/// A logical group of tokens, e.g. `[...]`.
@@ -28,7 +29,7 @@ pub struct Parser<'s> {
struct GroupEntry {
/// The start position of the group. Used by `Parser::end_group` to return
/// The group's full span.
- pub start: Pos,
+ pub start: usize,
/// The kind of group this is. This decides which tokens will end the group.
/// For example, a [`Group::Paren`] will be ended by
/// [`Token::RightParen`].
@@ -59,12 +60,12 @@ impl<'s> Parser<'s> {
let next = tokens.next();
Self {
diags: DiagSet::new(),
- next,
tokens,
- last_end: Pos::ZERO,
- peeked: next,
- next_start: Pos::ZERO,
groups: vec![],
+ next,
+ peeked: next,
+ prev_end: 0,
+ next_start: 0,
}
}
@@ -76,9 +77,9 @@ impl<'s> Parser<'s> {
/// Eat the next token and add a diagnostic that it is not the expected
/// `thing`.
pub fn expected(&mut self, what: &str) {
- let before = self.next_start;
+ let before = self.next_start();
if let Some(found) = self.eat() {
- let after = self.last_end;
+ let after = self.prev_end();
self.diag(error!(
before .. after,
"expected {}, found {}",
@@ -86,20 +87,20 @@ impl<'s> Parser<'s> {
found.name(),
));
} else {
- self.expected_at(what, self.next_start);
+ self.expected_at(what, self.next_start());
}
}
/// Add a diagnostic that `what` was expected at the given position.
- pub fn expected_at(&mut self, what: &str, pos: Pos) {
- self.diag(error!(pos, "expected {}", what));
+ pub fn expected_at(&mut self, what: &str, pos: impl Into<Pos>) {
+ self.diag(error!(pos.into(), "expected {}", what));
}
/// Eat the next token and add a diagnostic that it is unexpected.
pub fn unexpected(&mut self) {
- let before = self.next_start;
+ let before = self.next_start();
if let Some(found) = self.eat() {
- let after = self.last_end;
+ let after = self.prev_end();
self.diag(error!(before .. after, "unexpected {}", found.name()));
}
}
@@ -110,11 +111,10 @@ impl<'s> Parser<'s> {
/// `eat()` and `peek()` return `None`. Parsing can only continue with
/// a matching call to `end_group`.
///
- /// # Panics
/// This panics if the next token does not start the given group.
pub fn start_group(&mut self, kind: Group, mode: TokenMode) {
self.groups.push(GroupEntry {
- start: self.next_start,
+ start: self.next_start(),
kind,
outer_mode: self.tokens.mode(),
});
@@ -133,7 +133,6 @@ impl<'s> Parser<'s> {
/// End the parsing of a group.
///
- /// # Panics
/// This panics if no group was started.
pub fn end_group(&mut self) -> Span {
let prev_mode = self.tokens.mode();
@@ -156,17 +155,16 @@ impl<'s> Parser<'s> {
self.bump();
rescan = false;
} else if required {
- self.diag(error!(self.next_start, "expected {}", end.name()));
+ self.diag(error!(self.next_start(), "expected {}", end.name()));
}
}
// Rescan the peeked token if the mode changed.
if rescan {
- self.tokens.jump(self.last_end);
- self.bump();
+ self.jump(self.prev_end());
}
- Span::new(group.start, self.last_end)
+ Span::new(group.start, self.prev_end())
}
/// The tokenization mode outside of the current group.
@@ -193,7 +191,7 @@ impl<'s> Parser<'s> {
/// Peek at the next token if it follows immediately after the last one
/// without any whitespace in between.
pub fn peek_direct(&self) -> Option<Token<'s>> {
- if self.next_start == self.last_end {
+ if self.next_start() == self.prev_end() {
self.peeked
} else {
None
@@ -204,15 +202,17 @@ impl<'s> Parser<'s> {
///
/// Has length zero if `peek()` returns `None`.
pub fn peek_span(&self) -> Span {
- Span::new(
- self.next_start,
- if self.eof() { self.next_start } else { self.tokens.pos() },
- )
+ self.peek_range().into()
}
/// Peek at the source of the next token.
pub fn peek_src(&self) -> &'s str {
- self.get(self.peek_span())
+ self.tokens.scanner().get(self.peek_range())
+ }
+
+ /// Peek at the source range (start and end index) of the next token.
+ pub fn peek_range(&self) -> Range<usize> {
+ self.next_start() .. self.next_end()
}
/// Checks whether the next token fulfills a condition.
@@ -255,11 +255,11 @@ impl<'s> Parser<'s> {
mapped
}
- /// Eat the next token and return its span.
+ /// Eat the next token and return its source range.
pub fn eat_span(&mut self) -> Span {
- let start = self.next_start;
+ let start = self.next_start();
self.eat();
- Span::new(start, self.last_end)
+ Span::new(start, self.prev_end())
}
/// Consume the next token if it is the given one and produce a diagnostic
@@ -267,7 +267,7 @@ impl<'s> Parser<'s> {
pub fn expect(&mut self, t: Token) -> bool {
let eaten = self.eat_if(t);
if !eaten {
- self.expected_at(t.name(), self.last_end);
+ self.expected_at(t.name(), self.prev_end());
}
eaten
}
@@ -290,45 +290,48 @@ impl<'s> Parser<'s> {
}
}
- /// The position at which the next token starts.
- pub fn start(&self) -> Pos {
- self.next_start
- }
-
- /// The position at which the last token ended.
+ /// The index at which the last token ended.
///
/// Refers to the end of the last _non-whitespace_ token in code mode.
- pub fn end(&self) -> Pos {
- self.last_end
+ pub fn prev_end(&self) -> usize {
+ self.prev_end
}
- /// The span from `start` to the end of the last token.
- pub fn span(&self, start: Pos) -> Span {
- Span::new(start, self.last_end)
+ /// The index at which the next token starts.
+ pub fn next_start(&self) -> usize {
+ self.next_start
}
- /// Jump to a position in the source string.
- pub fn jump(&mut self, pos: Pos) {
- self.tokens.jump(pos);
- self.bump();
+ /// The index at which the next token will end.
+ ///
+ /// Is the same as [`next_start()`][Self::next_start] if `peek()` returns
+ /// `None`.
+ pub fn next_end(&self) -> usize {
+ self.tokens.index()
}
- /// Slice a part out of the source string.
- pub fn get(&self, span: impl Into<Span>) -> &'s str {
- self.tokens.scanner().get(span.into().to_range())
+ /// Determine the column for the given index in the source.
+ pub fn column(&self, index: usize) -> usize {
+ search_column(self.tokens.scanner().get(.. index))
}
- /// The underlying scanner.
- pub fn scanner(&self) -> Scanner<'s> {
- let mut scanner = self.tokens.scanner().clone();
- scanner.jump(self.next_start.to_usize());
- scanner
+ /// The span from `start` to [`self.prev_end()`](Self::prev_end).
+ pub fn span(&self, start: impl Into<Pos>) -> Span {
+ Span::new(start, self.prev_end())
+ }
+
+ /// Jump to an index in the string.
+ ///
+ /// You need to know the correct column.
+ fn jump(&mut self, index: usize) {
+ self.tokens.jump(index);
+ self.bump();
}
/// Move to the next token.
fn bump(&mut self) {
- self.last_end = self.tokens.pos();
- self.next_start = self.tokens.pos();
+ self.prev_end = self.tokens.index();
+ self.next_start = self.tokens.index();
self.next = self.tokens.next();
if self.tokens.mode() == TokenMode::Code {
@@ -339,7 +342,7 @@ impl<'s> Parser<'s> {
Some(Token::BlockComment(_)) => true,
_ => false,
} {
- self.next_start = self.tokens.pos();
+ self.next_start = self.tokens.index();
self.next = self.tokens.next();
}
}
@@ -381,7 +384,8 @@ impl<'s> Parser<'s> {
impl Debug for Parser<'_> {
fn fmt(&self, f: &mut Formatter) -> fmt::Result {
- let s = self.scanner();
+ let mut s = self.tokens.scanner();
+ s.jump(self.next_start());
write!(f, "Parser({}|{})", s.eaten(), s.rest())
}
}
diff --git a/src/parse/scanner.rs b/src/parse/scanner.rs
index cc23a612..1f262e63 100644
--- a/src/parse/scanner.rs
+++ b/src/parse/scanner.rs
@@ -2,7 +2,7 @@ use std::fmt::{self, Debug, Formatter};
use std::slice::SliceIndex;
/// A featureful char-based scanner.
-#[derive(Clone)]
+#[derive(Copy, Clone)]
pub struct Scanner<'s> {
src: &'s str,
index: usize,
diff --git a/src/parse/tokens.rs b/src/parse/tokens.rs
index fa86d2f1..74051801 100644
--- a/src/parse/tokens.rs
+++ b/src/parse/tokens.rs
@@ -38,20 +38,22 @@ impl<'s> Tokens<'s> {
self.mode = mode;
}
- /// The position in the string at which the last token ends and next token
+ /// The index in the string at which the last token ends and next token
/// will start.
- pub fn pos(&self) -> Pos {
- self.s.index().into()
+ pub fn index(&self) -> usize {
+ self.s.index()
}
- /// Jump to the given position.
- pub fn jump(&mut self, pos: Pos) {
- self.s.jump(pos.to_usize());
+ /// Jump to the given index in the string.
+ ///
+ /// You need to know the correct column.
+ pub fn jump(&mut self, index: usize) {
+ self.s.jump(index);
}
/// The underlying scanner.
- pub fn scanner(&self) -> &Scanner<'s> {
- &self.s
+ pub fn scanner(&self) -> Scanner<'s> {
+ self.s
}
}
@@ -62,126 +64,100 @@ impl<'s> Iterator for Tokens<'s> {
fn next(&mut self) -> Option<Self::Item> {
let start = self.s.index();
let c = self.s.eat()?;
-
- // This never loops. It just exists to allow breaking out of it.
- loop {
- // Common elements.
- return Some(match c {
- // Blocks and templates.
- '[' => Token::LeftBracket,
- ']' => Token::RightBracket,
- '{' => Token::LeftBrace,
- '}' => Token::RightBrace,
-
- // Headings, keywords, identifiers, colors.
- '#' => self.hash(start),
-
- // Whitespace.
- c if c.is_whitespace() => self.whitespace(c),
-
- // Comments.
- '/' if self.s.eat_if('/') => self.line_comment(),
- '/' if self.s.eat_if('*') => self.block_comment(),
- '*' if self.s.eat_if('/') => Token::Invalid(self.s.eaten_from(start)),
-
- _ => break,
- });
- }
-
- Some(match self.mode {
- TokenMode::Markup => match c {
- // Markup.
- '*' => Token::Star,
- '_' => Token::Underscore,
- '~' => Token::Tilde,
- '`' => self.raw(),
- '$' => self.math(),
- '\\' => self.backslash(),
-
- // Plain text.
- _ => self.text(start),
- },
-
- TokenMode::Code => match c {
- // Parens.
- '(' => Token::LeftParen,
- ')' => Token::RightParen,
-
- // Length two.
- '=' if self.s.eat_if('=') => Token::EqEq,
- '!' if self.s.eat_if('=') => Token::BangEq,
- '<' if self.s.eat_if('=') => Token::LtEq,
- '>' if self.s.eat_if('=') => Token::GtEq,
- '+' if self.s.eat_if('=') => Token::PlusEq,
- '-' if self.s.eat_if('=') => Token::HyphEq,
- '*' if self.s.eat_if('=') => Token::StarEq,
- '/' if self.s.eat_if('=') => Token::SlashEq,
- '.' if self.s.eat_if('.') => Token::Dots,
- '=' if self.s.eat_if('>') => Token::Arrow,
-
- // Length one.
- ',' => Token::Comma,
- ';' => Token::Semicolon,
- ':' => Token::Colon,
- '+' => Token::Plus,
- '-' => Token::Hyph,
- '*' => Token::Star,
- '/' => Token::Slash,
- '=' => Token::Eq,
- '<' => Token::Lt,
- '>' => Token::Gt,
-
- // Identifiers.
- c if is_id_start(c) => self.ident(start),
-
- // Numbers.
- c if c.is_ascii_digit()
- || (c == '.' && self.s.check(|n| n.is_ascii_digit())) =>
- {
- self.number(start, c)
- }
-
- // Strings.
- '"' => self.string(),
-
- _ => Token::Invalid(self.s.eaten_from(start)),
+ Some(match c {
+ // Blocks and templates.
+ '[' => Token::LeftBracket,
+ ']' => Token::RightBracket,
+ '{' => Token::LeftBrace,
+ '}' => Token::RightBrace,
+
+ // Headings, keywords, identifiers, colors.
+ '#' => self.hash(start),
+
+ // Whitespace.
+ c if c.is_whitespace() => self.whitespace(c),
+
+ // Comments.
+ '/' if self.s.eat_if('/') => self.line_comment(),
+ '/' if self.s.eat_if('*') => self.block_comment(),
+ '*' if self.s.eat_if('/') => Token::Invalid(self.s.eaten_from(start)),
+
+ // Other things.
+ _ => match self.mode {
+ TokenMode::Markup => self.markup(start, c),
+ TokenMode::Code => self.code(start, c),
},
})
}
}
impl<'s> Tokens<'s> {
- fn hash(&mut self, start: usize) -> Token<'s> {
- let read = self.s.eat_while(is_id_continue);
-
- match self.mode {
- TokenMode::Markup => {
- if read.is_empty() {
- return Token::Hashtag;
- }
-
- if let Some(token) = keyword(read) {
- return token;
- }
+ fn markup(&mut self, start: usize, c: char) -> Token<'s> {
+ match c {
+ // Markup.
+ '~' => Token::Tilde,
+ '*' => Token::Star,
+ '_' => Token::Underscore,
+ '\\' => self.backslash(),
+ '`' => self.raw(),
+ '$' => self.math(),
+ '-' => self.hyph(start),
+
+ // Plain text.
+ _ => self.text(start),
+ }
+ }
- if read.chars().next().map_or(false, is_id_start) {
- return Token::Ident(read);
- }
+ fn code(&mut self, start: usize, c: char) -> Token<'s> {
+ match c {
+ // Parens.
+ '(' => Token::LeftParen,
+ ')' => Token::RightParen,
+
+ // Length two.
+ '=' if self.s.eat_if('=') => Token::EqEq,
+ '!' if self.s.eat_if('=') => Token::BangEq,
+ '<' if self.s.eat_if('=') => Token::LtEq,
+ '>' if self.s.eat_if('=') => Token::GtEq,
+ '+' if self.s.eat_if('=') => Token::PlusEq,
+ '-' if self.s.eat_if('=') => Token::HyphEq,
+ '*' if self.s.eat_if('=') => Token::StarEq,
+ '/' if self.s.eat_if('=') => Token::SlashEq,
+ '.' if self.s.eat_if('.') => Token::Dots,
+ '=' if self.s.eat_if('>') => Token::Arrow,
+
+ // Length one.
+ ',' => Token::Comma,
+ ';' => Token::Semicolon,
+ ':' => Token::Colon,
+ '+' => Token::Plus,
+ '-' => Token::Hyph,
+ '*' => Token::Star,
+ '/' => Token::Slash,
+ '=' => Token::Eq,
+ '<' => Token::Lt,
+ '>' => Token::Gt,
+
+ // Identifiers.
+ c if is_id_start(c) => self.ident(start),
+
+ // Numbers.
+ c if c.is_ascii_digit()
+ || (c == '.' && self.s.check(|n| n.is_ascii_digit())) =>
+ {
+ self.number(start, c)
}
- TokenMode::Code => {
- if let Ok(color) = RgbaColor::from_str(read) {
- return Token::Color(color);
- }
- }
- }
+ // Strings.
+ '"' => self.string(),
- Token::Invalid(self.s.eaten_from(start))
+ _ => Token::Invalid(self.s.eaten_from(start)),
+ }
}
fn whitespace(&mut self, first: char) -> Token<'s> {
// Fast path for just a single space
- if first == ' ' && !self.s.check(|c| c.is_whitespace()) {
+ if first == ' ' && !self.s.check(char::is_whitespace) {
Token::Space(0)
} else {
self.s.uneat();
@@ -210,12 +186,13 @@ impl<'s> Tokens<'s> {
c if c.is_whitespace() => true,
// Comments.
'/' if self.s.check(|c| c == '/' || c == '*') => true,
- // Parenthesis and hashtag.
- '[' | ']' | '{' | '}' | '#' => true,
+ // Parentheses.
+ '[' | ']' | '{' | '}' => true,
// Markup.
- '*' | '_' | '=' | '~' | '`' | '$' => true,
+ '#' | '~' | '*' | '_' | '-' | '`' | '$' => true,
// Escaping.
'\\' => true,
+ // Just text.
_ => false,
} {
self.s.uneat();
@@ -226,6 +203,77 @@ impl<'s> Tokens<'s> {
Token::Text(self.s.eaten_from(start))
}
+ fn backslash(&mut self) -> Token<'s> {
+ if let Some(c) = self.s.peek() {
+ match c {
+ // Backslash and comments.
+ '\\' | '/' |
+ // Parenthesis and hashtag.
+ '[' | ']' | '{' | '}' | '#' |
+ // Markup.
+ '*' | '_' | '=' | '~' | '`' | '$' => {
+ let start = self.s.index();
+ self.s.eat_assert(c);
+ Token::Text(&self.s.eaten_from(start))
+ }
+ 'u' if self.s.peek_nth(1) == Some('{') => {
+ self.s.eat_assert('u');
+ self.s.eat_assert('{');
+ Token::UnicodeEscape(UnicodeEscapeToken {
+ // Allow more than `ascii_hexdigit` for better error recovery.
+ sequence: self.s.eat_while(|c| c.is_ascii_alphanumeric()),
+ terminated: self.s.eat_if('}'),
+ })
+ }
+ c if c.is_whitespace() => Token::Backslash,
+ _ => Token::Text("\\"),
+ }
+ } else {
+ Token::Backslash
+ }
+ }
+
+ fn hash(&mut self, start: usize) -> Token<'s> {
+ match self.mode {
+ TokenMode::Markup => {
+ if self.s.check(is_id_start) {
+ let read = self.s.eat_while(is_id_continue);
+ if let Some(keyword) = keyword(read) {
+ keyword
+ } else {
+ Token::Ident(read)
+ }
+ } else if self.s.check(|c| c != '#' && !c.is_whitespace()) {
+ Token::Text(self.s.eaten_from(start))
+ } else {
+ Token::Hashtag
+ }
+ }
+ TokenMode::Code => {
+ let read = self.s.eat_while(is_id_continue);
+ if let Ok(color) = RgbaColor::from_str(read) {
+ Token::Color(color)
+ } else {
+ Token::Invalid(self.s.eaten_from(start))
+ }
+ }
+ }
+ }
+
+ fn hyph(&mut self, start: usize) -> Token<'s> {
+ if self.s.eat_if('-') {
+ if self.s.eat_if('-') {
+ Token::HyphHyphHyph
+ } else {
+ Token::HyphHyph
+ }
+ } else if self.s.check(|c| !c.is_whitespace()) {
+ Token::Text(self.s.eaten_from(start))
+ } else {
+ Token::Hyph
+ }
+ }
+
fn raw(&mut self) -> Token<'s> {
let mut backticks = 1;
while self.s.eat_if('`') {
@@ -295,36 +343,6 @@ impl<'s> Tokens<'s> {
})
}
- fn backslash(&mut self) -> Token<'s> {
- if let Some(c) = self.s.peek() {
- match c {
- // Backslash and comments.
- '\\' | '/' |
- // Parenthesis and hashtag.
- '[' | ']' | '{' | '}' | '#' |
- // Markup.
- '*' | '_' | '=' | '~' | '`' | '$' => {
- let start = self.s.index();
- self.s.eat_assert(c);
- Token::Text(&self.s.eaten_from(start))
- }
- 'u' if self.s.peek_nth(1) == Some('{') => {
- self.s.eat_assert('u');
- self.s.eat_assert('{');
- Token::UnicodeEscape(UnicodeEscapeToken {
- // Allow more than `ascii_hexdigit` for better error recovery.
- sequence: self.s.eat_while(|c| c.is_ascii_alphanumeric()),
- terminated: self.s.eat_if('}'),
- })
- }
- c if c.is_whitespace() => Token::Backslash,
- _ => Token::Text("\\"),
- }
- } else {
- Token::Backslash
- }
- }
-
fn ident(&mut self, start: usize) -> Token<'s> {
self.s.eat_while(is_id_continue);
match self.s.eaten_from(start) {
@@ -474,6 +492,10 @@ mod tests {
use Token::{Ident, *};
use TokenMode::{Code, Markup};
+ const fn UnicodeEscape(sequence: &str, terminated: bool) -> Token {
+ Token::UnicodeEscape(UnicodeEscapeToken { sequence, terminated })
+ }
+
const fn Raw(text: &str, backticks: usize, terminated: bool) -> Token {
Token::Raw(RawToken { text, backticks, terminated })
}
@@ -482,18 +504,14 @@ mod tests {
Token::Math(MathToken { formula, display, terminated })
}
- const fn UnicodeEscape(sequence: &str, terminated: bool) -> Token {
- Token::UnicodeEscape(UnicodeEscapeToken { sequence, terminated })
+ const fn Color(r: u8, g: u8, b: u8, a: u8) -> Token<'static> {
+ Token::Color(RgbaColor { r, g, b, a })
}
const fn Str(string: &str, terminated: bool) -> Token {
Token::Str(StrToken { string, terminated })
}
- const fn Color(r: u8, g: u8, b: u8, a: u8) -> Token<'static> {
- Token::Color(RgbaColor { r, g, b, a })
- }
-
/// Building blocks for suffix testing.
///
/// We extend each test case with a collection of different suffixes to make
@@ -606,14 +624,91 @@ mod tests {
}
#[test]
+ fn test_tokenize_whitespace() {
+ // Test basic whitespace.
+ t!(Both["a1/"]: "" => );
+ t!(Both["a1/"]: " " => Space(0));
+ t!(Both["a1/"]: " " => Space(0));
+ t!(Both["a1/"]: "\t" => Space(0));
+ t!(Both["a1/"]: " \t" => Space(0));
+ t!(Both["a1/"]: "\u{202F}" => Space(0));
+
+ // Test newline counting.
+ t!(Both["a1/"]: "\n" => Space(1));
+ t!(Both["a1/"]: "\n " => Space(1));
+ t!(Both["a1/"]: " \n" => Space(1));
+ t!(Both["a1/"]: " \n " => Space(1));
+ t!(Both["a1/"]: "\r\n" => Space(1));
+ t!(Both["a1/"]: " \n\t \n " => Space(2));
+ t!(Both["a1/"]: "\n\r" => Space(2));
+ t!(Both["a1/"]: " \r\r\n \x0D" => Space(3));
+ }
+
+ #[test]
+ fn test_tokenize_text() {
+ // Test basic text.
+ t!(Markup[" /"]: "hello" => Text("hello"));
+ t!(Markup[" /"]: "hello-world" => Text("hello"), Text("-"), Text("world"));
+
+ // Test code symbols in text.
+ t!(Markup[" /"]: "a():\"b" => Text("a():\"b"));
+ t!(Markup[" /"]: ";:,|/+" => Text(";:,|/+"));
+ t!(Markup[" /"]: "#-a" => Text("#"), Text("-"), Text("a"));
+ t!(Markup[" "]: "#123" => Text("#"), Text("123"));
+
+ // Test text ends.
+ t!(Markup[""]: "hello " => Text("hello"), Space(0));
+ t!(Markup[""]: "hello~" => Text("hello"), Tilde);
+ }
+
+ #[test]
+ fn test_tokenize_escape_sequences() {
+ // Test escapable symbols.
+ t!(Markup: r"\\" => Text(r"\"));
+ t!(Markup: r"\/" => Text("/"));
+ t!(Markup: r"\[" => Text("["));
+ t!(Markup: r"\]" => Text("]"));
+ t!(Markup: r"\{" => Text("{"));
+ t!(Markup: r"\}" => Text("}"));
+ t!(Markup: r"\*" => Text("*"));
+ t!(Markup: r"\_" => Text("_"));
+ t!(Markup: r"\=" => Text("="));
+ t!(Markup: r"\~" => Text("~"));
+ t!(Markup: r"\`" => Text("`"));
+ t!(Markup: r"\$" => Text("$"));
+ t!(Markup: r"\#" => Text("#"));
+
+ // Test unescapable symbols.
+ t!(Markup[" /"]: r"\a" => Text(r"\"), Text("a"));
+ t!(Markup[" /"]: r"\u" => Text(r"\"), Text("u"));
+ t!(Markup[" /"]: r"\1" => Text(r"\"), Text("1"));
+ t!(Markup[" /"]: r"\:" => Text(r"\"), Text(":"));
+ t!(Markup[" /"]: r#"\""# => Text(r"\"), Text("\""));
+
+ // Test basic unicode escapes.
+ t!(Markup: r"\u{}" => UnicodeEscape("", true));
+ t!(Markup: r"\u{2603}" => UnicodeEscape("2603", true));
+ t!(Markup: r"\u{P}" => UnicodeEscape("P", true));
+
+ // Test unclosed unicode escapes.
+ t!(Markup[" /"]: r"\u{" => UnicodeEscape("", false));
+ t!(Markup[" /"]: r"\u{1" => UnicodeEscape("1", false));
+ t!(Markup[" /"]: r"\u{26A4" => UnicodeEscape("26A4", false));
+ t!(Markup[" /"]: r"\u{1Q3P" => UnicodeEscape("1Q3P", false));
+ t!(Markup: r"\u{1🏕}" => UnicodeEscape("1", false), Text("🏕"), RightBrace);
+ }
+
+ #[test]
fn test_tokenize_markup_symbols() {
// Test markup tokens.
- t!(Markup[" a1"]: "*" => Star);
- t!(Markup: "_" => Underscore);
- t!(Markup[""]: "###" => Hashtag, Hashtag, Hashtag);
- t!(Markup["a1/"]: "# " => Hashtag, Space(0));
- t!(Markup: "~" => Tilde);
- t!(Markup[" "]: r"\" => Backslash);
+ t!(Markup[" a1"]: "*" => Star);
+ t!(Markup: "_" => Underscore);
+ t!(Markup[""]: "###" => Hashtag, Hashtag, Hashtag);
+ t!(Markup["a1/"]: "# " => Hashtag, Space(0));
+ t!(Markup["a1/"]: "- " => Hyph, Space(0));
+ t!(Markup: "~" => Tilde);
+ t!(Markup[" "]: r"\" => Backslash);
+ t!(Markup["a "]: r"a--" => Text("a"), HyphHyph);
}
#[test]
@@ -654,71 +749,32 @@ mod tests {
#[test]
fn test_tokenize_keywords() {
- let keywords = [
+ // A list of a few (not all) keywords.
+ let list = [
("let", Let),
("if", If),
("else", Else),
("for", For),
("in", In),
- ("while", While),
- ("break", Break),
- ("continue", Continue),
- ("return", Return),
+ ("import", Import),
];
- for &(s, t) in &keywords {
+ for &(s, t) in &list {
t!(Markup[" "]: format!("#{}", s) => t);
t!(Markup[" "]: format!("#{0}#{0}", s) => t, t);
t!(Markup[" /"]: format!("# {}", s) => Token::Hashtag, Space(0), Text(s));
}
- for &(s, t) in &keywords {
+ for &(s, t) in &list {
t!(Code[" "]: s => t);
t!(Markup[" /"]: s => Text(s));
}
// Test simple identifier.
t!(Markup[" "]: "#letter" => Ident("letter"));
- t!(Markup[" "]: "#123" => Invalid("#123"));
- t!(Code[" /"]: "falser" => Ident("falser"));
- t!(Code[" /"]: "None" => Ident("None"));
- t!(Code[" /"]: "True" => Ident("True"));
- }
-
- #[test]
- fn test_tokenize_whitespace() {
- // Test basic whitespace.
- t!(Both["a1/"]: "" => );
- t!(Both["a1/"]: " " => Space(0));
- t!(Both["a1/"]: " " => Space(0));
- t!(Both["a1/"]: "\t" => Space(0));
- t!(Both["a1/"]: " \t" => Space(0));
- t!(Both["a1/"]: "\u{202F}" => Space(0));
-
- // Test newline counting.
- t!(Both["a1/"]: "\n" => Space(1));
- t!(Both["a1/"]: "\n " => Space(1));
- t!(Both["a1/"]: " \n" => Space(1));
- t!(Both["a1/"]: " \n " => Space(1));
- t!(Both["a1/"]: "\r\n" => Space(1));
- t!(Both["a1/"]: " \n\t \n " => Space(2));
- t!(Both["a1/"]: "\n\r" => Space(2));
- t!(Both["a1/"]: " \r\r\n \x0D" => Space(3));
- }
-
- #[test]
- fn test_tokenize_text() {
- // Test basic text.
- t!(Markup[" /"]: "hello" => Text("hello"));
- t!(Markup[" /"]: "hello-world" => Text("hello-world"));
-
- // Test code symbols in text.
- t!(Markup[" /"]: "a():\"b" => Text("a():\"b"));
- t!(Markup[" /"]: ";:,|/+-" => Text(";:,|/+-"));
-
- // Test text ends.
- t!(Markup[""]: "hello " => Text("hello"), Space(0));
- t!(Markup[""]: "hello~" => Text("hello"), Tilde);
+ t!(Code[" /"]: "falser" => Ident("falser"));
+ t!(Code[" /"]: "None" => Ident("None"));
+ t!(Code[" /"]: "True" => Ident("True"));
}
#[test]
@@ -765,43 +821,6 @@ mod tests {
}
#[test]
- fn test_tokenize_escape_sequences() {
- // Test escapable symbols.
- t!(Markup: r"\\" => Text(r"\"));
- t!(Markup: r"\/" => Text("/"));
- t!(Markup: r"\[" => Text("["));
- t!(Markup: r"\]" => Text("]"));
- t!(Markup: r"\{" => Text("{"));
- t!(Markup: r"\}" => Text("}"));
- t!(Markup: r"\*" => Text("*"));
- t!(Markup: r"\_" => Text("_"));
- t!(Markup: r"\=" => Text("="));
- t!(Markup: r"\~" => Text("~"));
- t!(Markup: r"\`" => Text("`"));
- t!(Markup: r"\$" => Text("$"));
- t!(Markup: r"\#" => Text("#"));
-
- // Test unescapable symbols.
- t!(Markup[" /"]: r"\a" => Text(r"\"), Text("a"));
- t!(Markup[" /"]: r"\u" => Text(r"\"), Text("u"));
- t!(Markup[" /"]: r"\1" => Text(r"\"), Text("1"));
- t!(Markup[" /"]: r"\:" => Text(r"\"), Text(":"));
- t!(Markup[" /"]: r#"\""# => Text(r"\"), Text("\""));
-
- // Test basic unicode escapes.
- t!(Markup: r"\u{}" => UnicodeEscape("", true));
- t!(Markup: r"\u{2603}" => UnicodeEscape("2603", true));
- t!(Markup: r"\u{P}" => UnicodeEscape("P", true));
-
- // Test unclosed unicode escapes.
- t!(Markup[" /"]: r"\u{" => UnicodeEscape("", false));
- t!(Markup[" /"]: r"\u{1" => UnicodeEscape("1", false));
- t!(Markup[" /"]: r"\u{26A4" => UnicodeEscape("26A4", false));
- t!(Markup[" /"]: r"\u{1Q3P" => UnicodeEscape("1Q3P", false));
- t!(Markup: r"\u{1🏕}" => UnicodeEscape("1", false), Text("🏕"), RightBrace);
- }
-
- #[test]
fn test_tokenize_idents() {
// Test valid identifiers.
t!(Code[" /"]: "x" => Ident("x"));
@@ -956,8 +975,7 @@ mod tests {
t!(Code: "1p%" => Invalid("1p"), Invalid("%"));
t!(Code: "1%%" => Percent(1.0), Invalid("%"));
- // Test invalid keyword.
- t!(Markup[" /"]: "#-" => Invalid("#-"));
+ // Test invalid color.
t!(Code[" /"]: r"#letter" => Invalid(r"#letter"));
}
}