summaryrefslogtreecommitdiff
path: root/src/parse/tokens.rs
diff options
context:
space:
mode:
authorLaurenz <laurmaedje@gmail.com>2021-06-09 00:37:13 +0200
committerLaurenz <laurmaedje@gmail.com>2021-06-09 00:37:13 +0200
commit5afb42ad89abb518a01a09051f0f9b6f75bd383e (patch)
treeb12368a287f22de711df8d759c20ee742ed5b4c2 /src/parse/tokens.rs
parentd69dfa84ec957ac4037f60a3335416a9f73b97c8 (diff)
Lists with indent-based parsing
- Unordered lists with indent-based parsing and basic layout using stacks - Headings are now also indent based - Removes syntax functions since they will be superseded by select & transform
Diffstat (limited to 'src/parse/tokens.rs')
-rw-r--r--src/parse/tokens.rs508
1 file changed, 263 insertions, 245 deletions
diff --git a/src/parse/tokens.rs b/src/parse/tokens.rs
index fa86d2f1..74051801 100644
--- a/src/parse/tokens.rs
+++ b/src/parse/tokens.rs
@@ -38,20 +38,22 @@ impl<'s> Tokens<'s> {
self.mode = mode;
}
- /// The position in the string at which the last token ends and next token
+ /// The index in the string at which the last token ends and next token
/// will start.
- pub fn pos(&self) -> Pos {
- self.s.index().into()
+ pub fn index(&self) -> usize {
+ self.s.index()
}
- /// Jump to the given position.
- pub fn jump(&mut self, pos: Pos) {
- self.s.jump(pos.to_usize());
+ /// Jump to the given index in the string.
+ ///
+ /// You need to know the correct column.
+ pub fn jump(&mut self, index: usize) {
+ self.s.jump(index);
}
/// The underlying scanner.
- pub fn scanner(&self) -> &Scanner<'s> {
- &self.s
+ pub fn scanner(&self) -> Scanner<'s> {
+ self.s
}
}
@@ -62,126 +64,100 @@ impl<'s> Iterator for Tokens<'s> {
fn next(&mut self) -> Option<Self::Item> {
let start = self.s.index();
let c = self.s.eat()?;
-
- // This never loops. It just exists to allow breaking out of it.
- loop {
- // Common elements.
- return Some(match c {
- // Blocks and templates.
- '[' => Token::LeftBracket,
- ']' => Token::RightBracket,
- '{' => Token::LeftBrace,
- '}' => Token::RightBrace,
-
- // Headings, keywords, identifiers, colors.
- '#' => self.hash(start),
-
- // Whitespace.
- c if c.is_whitespace() => self.whitespace(c),
-
- // Comments.
- '/' if self.s.eat_if('/') => self.line_comment(),
- '/' if self.s.eat_if('*') => self.block_comment(),
- '*' if self.s.eat_if('/') => Token::Invalid(self.s.eaten_from(start)),
-
- _ => break,
- });
- }
-
- Some(match self.mode {
- TokenMode::Markup => match c {
- // Markup.
- '*' => Token::Star,
- '_' => Token::Underscore,
- '~' => Token::Tilde,
- '`' => self.raw(),
- '$' => self.math(),
- '\\' => self.backslash(),
-
- // Plain text.
- _ => self.text(start),
- },
-
- TokenMode::Code => match c {
- // Parens.
- '(' => Token::LeftParen,
- ')' => Token::RightParen,
-
- // Length two.
- '=' if self.s.eat_if('=') => Token::EqEq,
- '!' if self.s.eat_if('=') => Token::BangEq,
- '<' if self.s.eat_if('=') => Token::LtEq,
- '>' if self.s.eat_if('=') => Token::GtEq,
- '+' if self.s.eat_if('=') => Token::PlusEq,
- '-' if self.s.eat_if('=') => Token::HyphEq,
- '*' if self.s.eat_if('=') => Token::StarEq,
- '/' if self.s.eat_if('=') => Token::SlashEq,
- '.' if self.s.eat_if('.') => Token::Dots,
- '=' if self.s.eat_if('>') => Token::Arrow,
-
- // Length one.
- ',' => Token::Comma,
- ';' => Token::Semicolon,
- ':' => Token::Colon,
- '+' => Token::Plus,
- '-' => Token::Hyph,
- '*' => Token::Star,
- '/' => Token::Slash,
- '=' => Token::Eq,
- '<' => Token::Lt,
- '>' => Token::Gt,
-
- // Identifiers.
- c if is_id_start(c) => self.ident(start),
-
- // Numbers.
- c if c.is_ascii_digit()
- || (c == '.' && self.s.check(|n| n.is_ascii_digit())) =>
- {
- self.number(start, c)
- }
-
- // Strings.
- '"' => self.string(),
-
- _ => Token::Invalid(self.s.eaten_from(start)),
+ Some(match c {
+ // Blocks and templates.
+ '[' => Token::LeftBracket,
+ ']' => Token::RightBracket,
+ '{' => Token::LeftBrace,
+ '}' => Token::RightBrace,
+
+ // Headings, keywords, identifiers, colors.
+ '#' => self.hash(start),
+
+ // Whitespace.
+ c if c.is_whitespace() => self.whitespace(c),
+
+ // Comments.
+ '/' if self.s.eat_if('/') => self.line_comment(),
+ '/' if self.s.eat_if('*') => self.block_comment(),
+ '*' if self.s.eat_if('/') => Token::Invalid(self.s.eaten_from(start)),
+
+ // Other things.
+ _ => match self.mode {
+ TokenMode::Markup => self.markup(start, c),
+ TokenMode::Code => self.code(start, c),
},
})
}
}
impl<'s> Tokens<'s> {
- fn hash(&mut self, start: usize) -> Token<'s> {
- let read = self.s.eat_while(is_id_continue);
-
- match self.mode {
- TokenMode::Markup => {
- if read.is_empty() {
- return Token::Hashtag;
- }
-
- if let Some(token) = keyword(read) {
- return token;
- }
+ fn markup(&mut self, start: usize, c: char) -> Token<'s> {
+ match c {
+ // Markup.
+ '~' => Token::Tilde,
+ '*' => Token::Star,
+ '_' => Token::Underscore,
+ '\\' => self.backslash(),
+ '`' => self.raw(),
+ '$' => self.math(),
+ '-' => self.hyph(start),
+
+ // Plain text.
+ _ => self.text(start),
+ }
+ }
- if read.chars().next().map_or(false, is_id_start) {
- return Token::Ident(read);
- }
+ fn code(&mut self, start: usize, c: char) -> Token<'s> {
+ match c {
+ // Parens.
+ '(' => Token::LeftParen,
+ ')' => Token::RightParen,
+
+ // Length two.
+ '=' if self.s.eat_if('=') => Token::EqEq,
+ '!' if self.s.eat_if('=') => Token::BangEq,
+ '<' if self.s.eat_if('=') => Token::LtEq,
+ '>' if self.s.eat_if('=') => Token::GtEq,
+ '+' if self.s.eat_if('=') => Token::PlusEq,
+ '-' if self.s.eat_if('=') => Token::HyphEq,
+ '*' if self.s.eat_if('=') => Token::StarEq,
+ '/' if self.s.eat_if('=') => Token::SlashEq,
+ '.' if self.s.eat_if('.') => Token::Dots,
+ '=' if self.s.eat_if('>') => Token::Arrow,
+
+ // Length one.
+ ',' => Token::Comma,
+ ';' => Token::Semicolon,
+ ':' => Token::Colon,
+ '+' => Token::Plus,
+ '-' => Token::Hyph,
+ '*' => Token::Star,
+ '/' => Token::Slash,
+ '=' => Token::Eq,
+ '<' => Token::Lt,
+ '>' => Token::Gt,
+
+ // Identifiers.
+ c if is_id_start(c) => self.ident(start),
+
+ // Numbers.
+ c if c.is_ascii_digit()
+ || (c == '.' && self.s.check(|n| n.is_ascii_digit())) =>
+ {
+ self.number(start, c)
}
- TokenMode::Code => {
- if let Ok(color) = RgbaColor::from_str(read) {
- return Token::Color(color);
- }
- }
- }
+ // Strings.
+ '"' => self.string(),
- Token::Invalid(self.s.eaten_from(start))
+ _ => Token::Invalid(self.s.eaten_from(start)),
+ }
}
fn whitespace(&mut self, first: char) -> Token<'s> {
// Fast path for just a single space
- if first == ' ' && !self.s.check(|c| c.is_whitespace()) {
+ if first == ' ' && !self.s.check(char::is_whitespace) {
Token::Space(0)
} else {
self.s.uneat();
@@ -210,12 +186,13 @@ impl<'s> Tokens<'s> {
c if c.is_whitespace() => true,
// Comments.
'/' if self.s.check(|c| c == '/' || c == '*') => true,
- // Parenthesis and hashtag.
- '[' | ']' | '{' | '}' | '#' => true,
+ // Parentheses.
+ '[' | ']' | '{' | '}' => true,
// Markup.
- '*' | '_' | '=' | '~' | '`' | '$' => true,
+ '#' | '~' | '*' | '_' | '-' | '`' | '$' => true,
// Escaping.
'\\' => true,
+ // Just text.
_ => false,
} {
self.s.uneat();
@@ -226,6 +203,77 @@ impl<'s> Tokens<'s> {
Token::Text(self.s.eaten_from(start))
}
+ fn backslash(&mut self) -> Token<'s> {
+ if let Some(c) = self.s.peek() {
+ match c {
+ // Backslash and comments.
+ '\\' | '/' |
+ // Parenthesis and hashtag.
+ '[' | ']' | '{' | '}' | '#' |
+ // Markup.
+ '*' | '_' | '=' | '~' | '`' | '$' => {
+ let start = self.s.index();
+ self.s.eat_assert(c);
+ Token::Text(&self.s.eaten_from(start))
+ }
+ 'u' if self.s.peek_nth(1) == Some('{') => {
+ self.s.eat_assert('u');
+ self.s.eat_assert('{');
+ Token::UnicodeEscape(UnicodeEscapeToken {
+ // Allow more than `ascii_hexdigit` for better error recovery.
+ sequence: self.s.eat_while(|c| c.is_ascii_alphanumeric()),
+ terminated: self.s.eat_if('}'),
+ })
+ }
+ c if c.is_whitespace() => Token::Backslash,
+ _ => Token::Text("\\"),
+ }
+ } else {
+ Token::Backslash
+ }
+ }
+
+ fn hash(&mut self, start: usize) -> Token<'s> {
+ match self.mode {
+ TokenMode::Markup => {
+ if self.s.check(is_id_start) {
+ let read = self.s.eat_while(is_id_continue);
+ if let Some(keyword) = keyword(read) {
+ keyword
+ } else {
+ Token::Ident(read)
+ }
+ } else if self.s.check(|c| c != '#' && !c.is_whitespace()) {
+ Token::Text(self.s.eaten_from(start))
+ } else {
+ Token::Hashtag
+ }
+ }
+ TokenMode::Code => {
+ let read = self.s.eat_while(is_id_continue);
+ if let Ok(color) = RgbaColor::from_str(read) {
+ Token::Color(color)
+ } else {
+ Token::Invalid(self.s.eaten_from(start))
+ }
+ }
+ }
+ }
+
+ fn hyph(&mut self, start: usize) -> Token<'s> {
+ if self.s.eat_if('-') {
+ if self.s.eat_if('-') {
+ Token::HyphHyphHyph
+ } else {
+ Token::HyphHyph
+ }
+ } else if self.s.check(|c| !c.is_whitespace()) {
+ Token::Text(self.s.eaten_from(start))
+ } else {
+ Token::Hyph
+ }
+ }
+
fn raw(&mut self) -> Token<'s> {
let mut backticks = 1;
while self.s.eat_if('`') {
@@ -295,36 +343,6 @@ impl<'s> Tokens<'s> {
})
}
- fn backslash(&mut self) -> Token<'s> {
- if let Some(c) = self.s.peek() {
- match c {
- // Backslash and comments.
- '\\' | '/' |
- // Parenthesis and hashtag.
- '[' | ']' | '{' | '}' | '#' |
- // Markup.
- '*' | '_' | '=' | '~' | '`' | '$' => {
- let start = self.s.index();
- self.s.eat_assert(c);
- Token::Text(&self.s.eaten_from(start))
- }
- 'u' if self.s.peek_nth(1) == Some('{') => {
- self.s.eat_assert('u');
- self.s.eat_assert('{');
- Token::UnicodeEscape(UnicodeEscapeToken {
- // Allow more than `ascii_hexdigit` for better error recovery.
- sequence: self.s.eat_while(|c| c.is_ascii_alphanumeric()),
- terminated: self.s.eat_if('}'),
- })
- }
- c if c.is_whitespace() => Token::Backslash,
- _ => Token::Text("\\"),
- }
- } else {
- Token::Backslash
- }
- }
-
fn ident(&mut self, start: usize) -> Token<'s> {
self.s.eat_while(is_id_continue);
match self.s.eaten_from(start) {
@@ -474,6 +492,10 @@ mod tests {
use Token::{Ident, *};
use TokenMode::{Code, Markup};
+ const fn UnicodeEscape(sequence: &str, terminated: bool) -> Token {
+ Token::UnicodeEscape(UnicodeEscapeToken { sequence, terminated })
+ }
+
const fn Raw(text: &str, backticks: usize, terminated: bool) -> Token {
Token::Raw(RawToken { text, backticks, terminated })
}
@@ -482,18 +504,14 @@ mod tests {
Token::Math(MathToken { formula, display, terminated })
}
- const fn UnicodeEscape(sequence: &str, terminated: bool) -> Token {
- Token::UnicodeEscape(UnicodeEscapeToken { sequence, terminated })
+ const fn Color(r: u8, g: u8, b: u8, a: u8) -> Token<'static> {
+ Token::Color(RgbaColor { r, g, b, a })
}
const fn Str(string: &str, terminated: bool) -> Token {
Token::Str(StrToken { string, terminated })
}
- const fn Color(r: u8, g: u8, b: u8, a: u8) -> Token<'static> {
- Token::Color(RgbaColor { r, g, b, a })
- }
-
/// Building blocks for suffix testing.
///
/// We extend each test case with a collection of different suffixes to make
@@ -606,14 +624,91 @@ mod tests {
}
#[test]
+ fn test_tokenize_whitespace() {
+ // Test basic whitespace.
+ t!(Both["a1/"]: "" => );
+ t!(Both["a1/"]: " " => Space(0));
+ t!(Both["a1/"]: " " => Space(0));
+ t!(Both["a1/"]: "\t" => Space(0));
+ t!(Both["a1/"]: " \t" => Space(0));
+ t!(Both["a1/"]: "\u{202F}" => Space(0));
+
+ // Test newline counting.
+ t!(Both["a1/"]: "\n" => Space(1));
+ t!(Both["a1/"]: "\n " => Space(1));
+ t!(Both["a1/"]: " \n" => Space(1));
+ t!(Both["a1/"]: " \n " => Space(1));
+ t!(Both["a1/"]: "\r\n" => Space(1));
+ t!(Both["a1/"]: " \n\t \n " => Space(2));
+ t!(Both["a1/"]: "\n\r" => Space(2));
+ t!(Both["a1/"]: " \r\r\n \x0D" => Space(3));
+ }
+
+ #[test]
+ fn test_tokenize_text() {
+ // Test basic text.
+ t!(Markup[" /"]: "hello" => Text("hello"));
+ t!(Markup[" /"]: "hello-world" => Text("hello"), Text("-"), Text("world"));
+
+ // Test code symbols in text.
+ t!(Markup[" /"]: "a():\"b" => Text("a():\"b"));
+ t!(Markup[" /"]: ";:,|/+" => Text(";:,|/+"));
+ t!(Markup[" /"]: "#-a" => Text("#"), Text("-"), Text("a"));
+ t!(Markup[" "]: "#123" => Text("#"), Text("123"));
+
+ // Test text ends.
+ t!(Markup[""]: "hello " => Text("hello"), Space(0));
+ t!(Markup[""]: "hello~" => Text("hello"), Tilde);
+ }
+
+ #[test]
+ fn test_tokenize_escape_sequences() {
+ // Test escapable symbols.
+ t!(Markup: r"\\" => Text(r"\"));
+ t!(Markup: r"\/" => Text("/"));
+ t!(Markup: r"\[" => Text("["));
+ t!(Markup: r"\]" => Text("]"));
+ t!(Markup: r"\{" => Text("{"));
+ t!(Markup: r"\}" => Text("}"));
+ t!(Markup: r"\*" => Text("*"));
+ t!(Markup: r"\_" => Text("_"));
+ t!(Markup: r"\=" => Text("="));
+ t!(Markup: r"\~" => Text("~"));
+ t!(Markup: r"\`" => Text("`"));
+ t!(Markup: r"\$" => Text("$"));
+ t!(Markup: r"\#" => Text("#"));
+
+ // Test unescapable symbols.
+ t!(Markup[" /"]: r"\a" => Text(r"\"), Text("a"));
+ t!(Markup[" /"]: r"\u" => Text(r"\"), Text("u"));
+ t!(Markup[" /"]: r"\1" => Text(r"\"), Text("1"));
+ t!(Markup[" /"]: r"\:" => Text(r"\"), Text(":"));
+ t!(Markup[" /"]: r#"\""# => Text(r"\"), Text("\""));
+
+ // Test basic unicode escapes.
+ t!(Markup: r"\u{}" => UnicodeEscape("", true));
+ t!(Markup: r"\u{2603}" => UnicodeEscape("2603", true));
+ t!(Markup: r"\u{P}" => UnicodeEscape("P", true));
+
+ // Test unclosed unicode escapes.
+ t!(Markup[" /"]: r"\u{" => UnicodeEscape("", false));
+ t!(Markup[" /"]: r"\u{1" => UnicodeEscape("1", false));
+ t!(Markup[" /"]: r"\u{26A4" => UnicodeEscape("26A4", false));
+ t!(Markup[" /"]: r"\u{1Q3P" => UnicodeEscape("1Q3P", false));
+ t!(Markup: r"\u{1🏕}" => UnicodeEscape("1", false), Text("🏕"), RightBrace);
+ }
+
+ #[test]
fn test_tokenize_markup_symbols() {
// Test markup tokens.
- t!(Markup[" a1"]: "*" => Star);
- t!(Markup: "_" => Underscore);
- t!(Markup[""]: "###" => Hashtag, Hashtag, Hashtag);
- t!(Markup["a1/"]: "# " => Hashtag, Space(0));
- t!(Markup: "~" => Tilde);
- t!(Markup[" "]: r"\" => Backslash);
+ t!(Markup[" a1"]: "*" => Star);
+ t!(Markup: "_" => Underscore);
+ t!(Markup[""]: "###" => Hashtag, Hashtag, Hashtag);
+ t!(Markup["a1/"]: "# " => Hashtag, Space(0));
+ t!(Markup["a1/"]: "- " => Hyph, Space(0));
+ t!(Markup: "~" => Tilde);
+ t!(Markup[" "]: r"\" => Backslash);
+ t!(Markup["a "]: r"a--" => Text("a"), HyphHyph);
}
#[test]
@@ -654,71 +749,32 @@ mod tests {
#[test]
fn test_tokenize_keywords() {
- let keywords = [
+ // A list of a few (not all) keywords.
+ let list = [
("let", Let),
("if", If),
("else", Else),
("for", For),
("in", In),
- ("while", While),
- ("break", Break),
- ("continue", Continue),
- ("return", Return),
+ ("import", Import),
];
- for &(s, t) in &keywords {
+ for &(s, t) in &list {
t!(Markup[" "]: format!("#{}", s) => t);
t!(Markup[" "]: format!("#{0}#{0}", s) => t, t);
t!(Markup[" /"]: format!("# {}", s) => Token::Hashtag, Space(0), Text(s));
}
- for &(s, t) in &keywords {
+ for &(s, t) in &list {
t!(Code[" "]: s => t);
t!(Markup[" /"]: s => Text(s));
}
// Test simple identifier.
t!(Markup[" "]: "#letter" => Ident("letter"));
- t!(Markup[" "]: "#123" => Invalid("#123"));
- t!(Code[" /"]: "falser" => Ident("falser"));
- t!(Code[" /"]: "None" => Ident("None"));
- t!(Code[" /"]: "True" => Ident("True"));
- }
-
- #[test]
- fn test_tokenize_whitespace() {
- // Test basic whitespace.
- t!(Both["a1/"]: "" => );
- t!(Both["a1/"]: " " => Space(0));
- t!(Both["a1/"]: " " => Space(0));
- t!(Both["a1/"]: "\t" => Space(0));
- t!(Both["a1/"]: " \t" => Space(0));
- t!(Both["a1/"]: "\u{202F}" => Space(0));
-
- // Test newline counting.
- t!(Both["a1/"]: "\n" => Space(1));
- t!(Both["a1/"]: "\n " => Space(1));
- t!(Both["a1/"]: " \n" => Space(1));
- t!(Both["a1/"]: " \n " => Space(1));
- t!(Both["a1/"]: "\r\n" => Space(1));
- t!(Both["a1/"]: " \n\t \n " => Space(2));
- t!(Both["a1/"]: "\n\r" => Space(2));
- t!(Both["a1/"]: " \r\r\n \x0D" => Space(3));
- }
-
- #[test]
- fn test_tokenize_text() {
- // Test basic text.
- t!(Markup[" /"]: "hello" => Text("hello"));
- t!(Markup[" /"]: "hello-world" => Text("hello-world"));
-
- // Test code symbols in text.
- t!(Markup[" /"]: "a():\"b" => Text("a():\"b"));
- t!(Markup[" /"]: ";:,|/+-" => Text(";:,|/+-"));
-
- // Test text ends.
- t!(Markup[""]: "hello " => Text("hello"), Space(0));
- t!(Markup[""]: "hello~" => Text("hello"), Tilde);
+ t!(Code[" /"]: "falser" => Ident("falser"));
+ t!(Code[" /"]: "None" => Ident("None"));
+ t!(Code[" /"]: "True" => Ident("True"));
}
#[test]
@@ -765,43 +821,6 @@ mod tests {
}
#[test]
- fn test_tokenize_escape_sequences() {
- // Test escapable symbols.
- t!(Markup: r"\\" => Text(r"\"));
- t!(Markup: r"\/" => Text("/"));
- t!(Markup: r"\[" => Text("["));
- t!(Markup: r"\]" => Text("]"));
- t!(Markup: r"\{" => Text("{"));
- t!(Markup: r"\}" => Text("}"));
- t!(Markup: r"\*" => Text("*"));
- t!(Markup: r"\_" => Text("_"));
- t!(Markup: r"\=" => Text("="));
- t!(Markup: r"\~" => Text("~"));
- t!(Markup: r"\`" => Text("`"));
- t!(Markup: r"\$" => Text("$"));
- t!(Markup: r"\#" => Text("#"));
-
- // Test unescapable symbols.
- t!(Markup[" /"]: r"\a" => Text(r"\"), Text("a"));
- t!(Markup[" /"]: r"\u" => Text(r"\"), Text("u"));
- t!(Markup[" /"]: r"\1" => Text(r"\"), Text("1"));
- t!(Markup[" /"]: r"\:" => Text(r"\"), Text(":"));
- t!(Markup[" /"]: r#"\""# => Text(r"\"), Text("\""));
-
- // Test basic unicode escapes.
- t!(Markup: r"\u{}" => UnicodeEscape("", true));
- t!(Markup: r"\u{2603}" => UnicodeEscape("2603", true));
- t!(Markup: r"\u{P}" => UnicodeEscape("P", true));
-
- // Test unclosed unicode escapes.
- t!(Markup[" /"]: r"\u{" => UnicodeEscape("", false));
- t!(Markup[" /"]: r"\u{1" => UnicodeEscape("1", false));
- t!(Markup[" /"]: r"\u{26A4" => UnicodeEscape("26A4", false));
- t!(Markup[" /"]: r"\u{1Q3P" => UnicodeEscape("1Q3P", false));
- t!(Markup: r"\u{1🏕}" => UnicodeEscape("1", false), Text("🏕"), RightBrace);
- }
-
- #[test]
fn test_tokenize_idents() {
// Test valid identifiers.
t!(Code[" /"]: "x" => Ident("x"));
@@ -956,8 +975,7 @@ mod tests {
t!(Code: "1p%" => Invalid("1p"), Invalid("%"));
t!(Code: "1%%" => Percent(1.0), Invalid("%"));
- // Test invalid keyword.
- t!(Markup[" /"]: "#-" => Invalid("#-"));
+ // Test invalid color.
t!(Code[" /"]: r"#letter" => Invalid(r"#letter"));
}
}