summary | refs | log | tree | commit | diff
path: root/src/syntax/lexer.rs
diff options
context:
space:
mode:
author: Laurenz <laurmaedje@gmail.com>, 2023-01-20 14:05:17 +0100
committer: Laurenz <laurmaedje@gmail.com>, 2023-01-20 14:05:35 +0100
commit: dd331f007cb9c9968605f8d3eaef8fb498c21322 (patch)
tree: f1b1490758ec53fd204724a325158d16c980d131 /src/syntax/lexer.rs
parent: 40561e57fbbc68becac07acd54a34f94f591f277 (diff)
Rewrite parser
Diffstat (limited to 'src/syntax/lexer.rs')
-rw-r--r-- src/syntax/lexer.rs 344
1 files changed, 156 insertions, 188 deletions
diff --git a/src/syntax/lexer.rs b/src/syntax/lexer.rs
index f082bd28..e3c29150 100644
--- a/src/syntax/lexer.rs
+++ b/src/syntax/lexer.rs
@@ -9,12 +9,11 @@ use crate::util::{format_eco, EcoString};
pub(super) struct Lexer<'s> {
/// The underlying scanner.
s: Scanner<'s>,
- /// The mode the lexer is in. This determines what tokens it recognizes.
+ /// The mode the lexer is in. This determines which kinds of tokens it
+ /// produces.
mode: LexMode,
- /// Whether the last token has been terminated.
- terminated: bool,
- /// Offsets the indentation on the first line of the source.
- column_offset: usize,
+ /// Whether the last token contained a newline.
+ newline: bool,
/// An error for the last token.
error: Option<(EcoString, ErrorPos)>,
}
@@ -33,12 +32,11 @@ pub(super) enum LexMode {
impl<'s> Lexer<'s> {
/// Create a new lexer with the given mode and a prefix to offset column
/// calculations.
- pub fn with_prefix(prefix: &str, text: &'s str, mode: LexMode) -> Self {
+ pub fn new(text: &'s str, mode: LexMode) -> Self {
Self {
s: Scanner::new(text),
mode,
- terminated: true,
- column_offset: column(prefix, prefix.len(), 0),
+ newline: false,
error: None,
}
}
@@ -64,26 +62,18 @@ impl<'s> Lexer<'s> {
self.s.jump(index);
}
- /// The underlying scanner.
- pub fn scanner(&self) -> Scanner<'s> {
- self.s
- }
-
- /// Whether the last token was terminated.
- pub fn terminated(&self) -> bool {
- self.terminated
+ /// Whether the last token contained a newline.
+ pub fn newline(&self) -> bool {
+ self.newline
}
- /// The column index of a given index in the source string.
- pub fn column(&self, index: usize) -> usize {
- column(self.s.string(), index, self.column_offset)
- }
-
- /// Take out the last error.
- pub fn last_error(&mut self) -> Option<(EcoString, ErrorPos)> {
+ /// Take out the last error, if any.
+ pub fn take_error(&mut self) -> Option<(EcoString, ErrorPos)> {
self.error.take()
}
+}
+impl Lexer<'_> {
/// Construct a full-positioned syntax error.
fn error(&mut self, message: impl Into<EcoString>) -> SyntaxKind {
self.error = Some((message.into(), ErrorPos::Full));
@@ -97,45 +87,53 @@ impl<'s> Lexer<'s> {
}
}
-impl Iterator for Lexer<'_> {
- type Item = SyntaxKind;
-
- /// Produce the next token.
- fn next(&mut self) -> Option<Self::Item> {
+/// Shared.
+impl Lexer<'_> {
+ pub fn next(&mut self) -> SyntaxKind {
+ self.newline = false;
self.error = None;
let start = self.s.cursor();
- let c = self.s.eat()?;
- Some(match c {
- // Trivia.
- c if c.is_whitespace() => self.whitespace(c),
- '/' if self.s.eat_if('/') => self.line_comment(),
- '/' if self.s.eat_if('*') => self.block_comment(),
- '*' if self.s.eat_if('/') => self.error("unexpected end of block comment"),
-
- // Other things.
- _ => match self.mode {
+ match self.s.eat() {
+ Some(c) if c.is_whitespace() => self.whitespace(start, c),
+ Some('/') if self.s.eat_if('/') => self.line_comment(),
+ Some('/') if self.s.eat_if('*') => self.block_comment(),
+ Some('*') if self.s.eat_if('/') => {
+ self.error("unexpected end of block comment")
+ }
+
+ Some(c) => match self.mode {
LexMode::Markup => self.markup(start, c),
LexMode::Math => self.math(c),
LexMode::Code => self.code(start, c),
},
- })
+
+ None => SyntaxKind::Eof,
+ }
+ }
+
+ fn whitespace(&mut self, start: usize, c: char) -> SyntaxKind {
+ let more = self.s.eat_while(char::is_whitespace);
+ let newlines = match c {
+ ' ' if more.is_empty() => 0,
+ _ => count_newlines(self.s.from(start)),
+ };
+
+ self.newline = newlines > 0;
+ if self.mode == LexMode::Markup && newlines >= 2 {
+ SyntaxKind::Parbreak
+ } else {
+ SyntaxKind::Space
+ }
}
-}
-/// Shared.
-impl Lexer<'_> {
fn line_comment(&mut self) -> SyntaxKind {
self.s.eat_until(is_newline);
- if self.s.done() {
- self.terminated = false;
- }
SyntaxKind::LineComment
}
fn block_comment(&mut self) -> SyntaxKind {
let mut state = '_';
let mut depth = 1;
- self.terminated = false;
// Find the first `*/` that does not correspond to a nested `/*`.
while let Some(c) = self.s.eat() {
@@ -143,7 +141,6 @@ impl Lexer<'_> {
('*', '/') => {
depth -= 1;
if depth == 0 {
- self.terminated = true;
break;
}
'_'
@@ -162,32 +159,6 @@ impl Lexer<'_> {
SyntaxKind::BlockComment
}
-
- fn whitespace(&mut self, c: char) -> SyntaxKind {
- if c == ' ' && !self.s.at(char::is_whitespace) {
- return SyntaxKind::Space { newlines: 0 };
- }
-
- self.s.uneat();
-
- // Count the number of newlines.
- let mut newlines = 0;
- while let Some(c) = self.s.eat() {
- if !c.is_whitespace() {
- self.s.uneat();
- break;
- }
-
- if is_newline(c) {
- if c == '\r' {
- self.s.eat_if('\n');
- }
- newlines += 1;
- }
- }
-
- SyntaxKind::Space { newlines }
- }
}
/// Markup.
@@ -199,9 +170,9 @@ impl Lexer<'_> {
'`' => self.raw(),
'h' if self.s.eat_if("ttp://") => self.link(),
'h' if self.s.eat_if("ttps://") => self.link(),
+ '0'..='9' => self.numbering(start),
'<' if self.s.at(is_id_continue) => self.label(),
'@' if self.s.at(is_id_continue) => self.reference(),
- '0'..='9' => self.numbering(start),
'#' if self.s.eat_if('{') => SyntaxKind::LeftBrace,
'#' if self.s.eat_if('[') => SyntaxKind::LeftBracket,
'#' if self.s.at(is_id_start) => {
@@ -225,63 +196,28 @@ impl Lexer<'_> {
'\'' => SyntaxKind::SmartQuote,
'"' => SyntaxKind::SmartQuote,
'$' => SyntaxKind::Dollar,
- '=' => SyntaxKind::Eq,
- '+' => SyntaxKind::Plus,
- '/' => SyntaxKind::Slash,
'~' => SyntaxKind::Shorthand,
':' => SyntaxKind::Colon,
- '-' => SyntaxKind::Minus,
-
- _ => self.text(),
- }
- }
-
- fn text(&mut self) -> SyntaxKind {
- macro_rules! table {
- ($(|$c:literal)*) => {
- static TABLE: [bool; 128] = {
- let mut t = [false; 128];
- $(t[$c as usize] = true;)*
- t
- };
- };
- }
-
- table! {
- | ' ' | '\t' | '\n' | '\x0b' | '\x0c' | '\r' | '\\' | '/'
- | '[' | ']' | '{' | '}' | '~' | '-' | '.' | '\'' | '"'
- | '*' | '_' | ':' | 'h' | '`' | '$' | '<' | '>' | '@' | '#'
- };
-
- loop {
- self.s.eat_until(|c: char| {
- TABLE.get(c as usize).copied().unwrap_or_else(|| c.is_whitespace())
- });
-
- // Continue with the same text node if the thing would become text
- // anyway.
- let mut s = self.s;
- match s.eat() {
- Some(' ') if s.at(char::is_alphanumeric) => {}
- Some('/') if !s.at(['/', '*']) => {}
- Some('-') if !s.at(['-', '?']) => {}
- Some('.') if !s.at("..") => {}
- Some('h') if !s.at("ttp://") && !s.at("ttps://") => {}
- Some('@' | '#') if !s.at(is_id_start) => {}
- _ => break,
+ '=' => {
+ self.s.eat_while('=');
+ if self.space_and_more() {
+ SyntaxKind::HeadingMarker
+ } else {
+ self.text()
+ }
}
+ '-' if self.space_and_more() => SyntaxKind::ListMarker,
+ '+' if self.space_and_more() => SyntaxKind::EnumMarker,
+ '/' if self.space_and_more() => SyntaxKind::TermMarker,
- self.s = s;
+ _ => self.text(),
}
-
- SyntaxKind::Text
}
fn backslash(&mut self) -> SyntaxKind {
if self.s.eat_if("u{") {
let hex = self.s.eat_while(char::is_ascii_alphanumeric);
if !self.s.eat_if('}') {
- self.terminated = false;
return self.error_at_end("expected closing brace");
}
@@ -324,33 +260,14 @@ impl Lexer<'_> {
}
}
- fn link(&mut self) -> SyntaxKind {
- #[rustfmt::skip]
- self.s.eat_while(|c: char| matches!(c,
- | '0' ..= '9'
- | 'a' ..= 'z'
- | 'A' ..= 'Z'
- | '~' | '/' | '%' | '?' | '#' | '&' | '+' | '='
- | '\'' | '.' | ',' | ';'
- ));
-
- if self.s.scout(-1) == Some('.') {
- self.s.uneat();
- }
-
- SyntaxKind::Link
- }
-
fn raw(&mut self) -> SyntaxKind {
- let column = self.column(self.s.cursor() - 1);
-
let mut backticks = 1;
while self.s.eat_if('`') {
backticks += 1;
}
if backticks == 2 {
- return SyntaxKind::Raw { column };
+ return SyntaxKind::Raw;
}
let mut found = 0;
@@ -363,7 +280,6 @@ impl Lexer<'_> {
}
if found != backticks {
- self.terminated = false;
let remaining = backticks - found;
let noun = if remaining == 1 { "backtick" } else { "backticks" };
return self.error_at_end(if found == 0 {
@@ -373,7 +289,24 @@ impl Lexer<'_> {
});
}
- SyntaxKind::Raw { column }
+ SyntaxKind::Raw
+ }
+
+ fn link(&mut self) -> SyntaxKind {
+ #[rustfmt::skip]
+ self.s.eat_while(|c: char| matches!(c,
+ | '0' ..= '9'
+ | 'a' ..= 'z'
+ | 'A' ..= 'Z'
+ | '~' | '/' | '%' | '?' | '#' | '&' | '+' | '='
+ | '\'' | '.' | ',' | ';'
+ ));
+
+ if self.s.scout(-1) == Some('.') {
+ self.s.uneat();
+ }
+
+ SyntaxKind::Link
}
fn numbering(&mut self, start: usize) -> SyntaxKind {
@@ -386,23 +319,86 @@ impl Lexer<'_> {
return self.error("must be positive");
}
- return SyntaxKind::EnumNumbering;
+ return SyntaxKind::EnumMarker;
}
}
self.text()
}
+ fn label(&mut self) -> SyntaxKind {
+ let label = self.s.eat_while(is_id_continue);
+ if label.is_empty() {
+ return self.error("label cannot be empty");
+ }
+
+ if !self.s.eat_if('>') {
+ return self.error_at_end("expected closing angle bracket");
+ }
+
+ SyntaxKind::Label
+ }
+
fn reference(&mut self) -> SyntaxKind {
self.s.eat_while(is_id_continue);
SyntaxKind::Ref
}
+ fn text(&mut self) -> SyntaxKind {
+ macro_rules! table {
+ ($(|$c:literal)*) => {
+ static TABLE: [bool; 128] = {
+ let mut t = [false; 128];
+ $(t[$c as usize] = true;)*
+ t
+ };
+ };
+ }
+
+ table! {
+ | ' ' | '\t' | '\n' | '\x0b' | '\x0c' | '\r' | '\\' | '/'
+ | '[' | ']' | '{' | '}' | '~' | '-' | '.' | '\'' | '"'
+ | '*' | '_' | ':' | 'h' | '`' | '$' | '<' | '>' | '@' | '#'
+ };
+
+ loop {
+ self.s.eat_until(|c: char| {
+ TABLE.get(c as usize).copied().unwrap_or_else(|| c.is_whitespace())
+ });
+
+ // Continue with the same text node if the thing would become text
+ // anyway.
+ let mut s = self.s;
+ match s.eat() {
+ Some(' ') if s.at(char::is_alphanumeric) => {}
+ Some('/') if !s.at(['/', '*']) => {}
+ Some('-') if !s.at(['-', '?']) => {}
+ Some('.') if !s.at("..") => {}
+ Some('h') if !s.at("ttp://") && !s.at("ttps://") => {}
+ Some('@' | '#') if !s.at(is_id_start) => {}
+ _ => break,
+ }
+
+ self.s = s;
+ }
+
+ SyntaxKind::Text
+ }
+
fn in_word(&self) -> bool {
- let alphanumeric = |c: Option<char>| c.map_or(false, |c| c.is_alphanumeric());
+ let alphanum = |c: Option<char>| c.map_or(false, |c| c.is_alphanumeric());
let prev = self.s.scout(-2);
let next = self.s.peek();
- alphanumeric(prev) && alphanumeric(next)
+ alphanum(prev) && alphanum(next)
+ }
+
+ fn space_and_more(&self) -> bool {
+ let mut s = self.s;
+ if !s.at(char::is_whitespace) {
+ return false;
+ }
+ s.eat_while(|c: char| c.is_whitespace() && !is_newline(c));
+ !s.done() && !s.at(is_newline)
}
}
@@ -586,26 +582,11 @@ impl Lexer<'_> {
});
if !self.s.eat_if('"') {
- self.terminated = false;
return self.error_at_end("expected quote");
}
SyntaxKind::Str
}
-
- fn label(&mut self) -> SyntaxKind {
- let label = self.s.eat_while(is_id_continue);
- if label.is_empty() {
- return self.error("label cannot be empty");
- }
-
- if !self.s.eat_if('>') {
- self.terminated = false;
- return self.error_at_end("expected closing angle bracket");
- }
-
- SyntaxKind::Label
- }
}
/// Try to parse an identifier into a keyword.
@@ -632,34 +613,6 @@ fn keyword(ident: &str) -> Option<SyntaxKind> {
})
}
-/// The column index of a given index in the source string, given a column
-/// offset for the first line.
-fn column(string: &str, index: usize, offset: usize) -> usize {
- let mut apply_offset = false;
- let res = string[..index]
- .char_indices()
- .rev()
- .take_while(|&(_, c)| !is_newline(c))
- .inspect(|&(i, _)| {
- if i == 0 {
- apply_offset = true
- }
- })
- .count();
-
- // The loop is never executed if the slice is empty, but we are of
- // course still at the start of the first line.
- if index == 0 {
- apply_offset = true;
- }
-
- if apply_offset {
- res + offset
- } else {
- res
- }
-}
-
/// Whether this character denotes a newline.
#[inline]
pub fn is_newline(character: char) -> bool {
@@ -695,6 +648,21 @@ pub(super) fn split_newlines(text: &str) -> Vec<&str> {
lines
}
+/// Count the number of newlines in text.
+fn count_newlines(text: &str) -> usize {
+ let mut newlines = 0;
+ let mut s = Scanner::new(text);
+ while let Some(c) = s.eat() {
+ if is_newline(c) {
+ if c == '\r' {
+ s.eat_if('\n');
+ }
+ newlines += 1;
+ }
+ }
+ newlines
+}
+
/// Whether a string is a valid unicode identifier.
///
/// In addition to what is specified in the [Unicode Standard][uax31], we allow: