diff options
| author | Laurenz <laurmaedje@gmail.com> | 2023-01-20 14:05:17 +0100 |
|---|---|---|
| committer | Laurenz <laurmaedje@gmail.com> | 2023-01-20 14:05:35 +0100 |
| commit | dd331f007cb9c9968605f8d3eaef8fb498c21322 (patch) | |
| tree | f1b1490758ec53fd204724a325158d16c980d131 /src/syntax/lexer.rs | |
| parent | 40561e57fbbc68becac07acd54a34f94f591f277 (diff) | |
Rewrite parser
Diffstat (limited to 'src/syntax/lexer.rs')
| -rw-r--r-- | src/syntax/lexer.rs | 344 |
1 files changed, 156 insertions, 188 deletions
diff --git a/src/syntax/lexer.rs b/src/syntax/lexer.rs index f082bd28..e3c29150 100644 --- a/src/syntax/lexer.rs +++ b/src/syntax/lexer.rs @@ -9,12 +9,11 @@ use crate::util::{format_eco, EcoString}; pub(super) struct Lexer<'s> { /// The underlying scanner. s: Scanner<'s>, - /// The mode the lexer is in. This determines what tokens it recognizes. + /// The mode the lexer is in. This determines which kinds of tokens it + /// produces. mode: LexMode, - /// Whether the last token has been terminated. - terminated: bool, - /// Offsets the indentation on the first line of the source. - column_offset: usize, + /// Whether the last token contained a newline. + newline: bool, /// An error for the last token. error: Option<(EcoString, ErrorPos)>, } @@ -33,12 +32,11 @@ pub(super) enum LexMode { impl<'s> Lexer<'s> { /// Create a new lexer with the given mode and a prefix to offset column /// calculations. - pub fn with_prefix(prefix: &str, text: &'s str, mode: LexMode) -> Self { + pub fn new(text: &'s str, mode: LexMode) -> Self { Self { s: Scanner::new(text), mode, - terminated: true, - column_offset: column(prefix, prefix.len(), 0), + newline: false, error: None, } } @@ -64,26 +62,18 @@ impl<'s> Lexer<'s> { self.s.jump(index); } - /// The underlying scanner. - pub fn scanner(&self) -> Scanner<'s> { - self.s - } - - /// Whether the last token was terminated. - pub fn terminated(&self) -> bool { - self.terminated + /// Whether the last token contained a newline. + pub fn newline(&self) -> bool { + self.newline } - /// The column index of a given index in the source string. - pub fn column(&self, index: usize) -> usize { - column(self.s.string(), index, self.column_offset) - } - - /// Take out the last error. - pub fn last_error(&mut self) -> Option<(EcoString, ErrorPos)> { + /// Take out the last error, if any. + pub fn take_error(&mut self) -> Option<(EcoString, ErrorPos)> { self.error.take() } +} +impl Lexer<'_> { /// Construct a full-positioned syntax error. fn error(&mut self, message: impl Into<EcoString>) -> SyntaxKind { self.error = Some((message.into(), ErrorPos::Full)); @@ -97,45 +87,53 @@ impl<'s> Lexer<'s> { } } -impl Iterator for Lexer<'_> { - type Item = SyntaxKind; - - /// Produce the next token. - fn next(&mut self) -> Option<Self::Item> { +/// Shared. +impl Lexer<'_> { + pub fn next(&mut self) -> SyntaxKind { + self.newline = false; self.error = None; let start = self.s.cursor(); - let c = self.s.eat()?; - Some(match c { - // Trivia. - c if c.is_whitespace() => self.whitespace(c), - '/' if self.s.eat_if('/') => self.line_comment(), - '/' if self.s.eat_if('*') => self.block_comment(), - '*' if self.s.eat_if('/') => self.error("unexpected end of block comment"), - - // Other things. - _ => match self.mode { + match self.s.eat() { + Some(c) if c.is_whitespace() => self.whitespace(start, c), + Some('/') if self.s.eat_if('/') => self.line_comment(), + Some('/') if self.s.eat_if('*') => self.block_comment(), + Some('*') if self.s.eat_if('/') => { + self.error("unexpected end of block comment") + } + + Some(c) => match self.mode { LexMode::Markup => self.markup(start, c), LexMode::Math => self.math(c), LexMode::Code => self.code(start, c), }, - }) + + None => SyntaxKind::Eof, + } + } + + fn whitespace(&mut self, start: usize, c: char) -> SyntaxKind { + let more = self.s.eat_while(char::is_whitespace); + let newlines = match c { + ' ' if more.is_empty() => 0, + _ => count_newlines(self.s.from(start)), + }; + + self.newline = newlines > 0; + if self.mode == LexMode::Markup && newlines >= 2 { + SyntaxKind::Parbreak + } else { + SyntaxKind::Space + } } -} -/// Shared. -impl Lexer<'_> { fn line_comment(&mut self) -> SyntaxKind { self.s.eat_until(is_newline); - if self.s.done() { - self.terminated = false; - } SyntaxKind::LineComment } fn block_comment(&mut self) -> SyntaxKind { let mut state = '_'; let mut depth = 1; - self.terminated = false; // Find the first `*/` that does not correspond to a nested `/*`. while let Some(c) = self.s.eat() { @@ -143,7 +141,6 @@ impl Lexer<'_> { ('*', '/') => { depth -= 1; if depth == 0 { - self.terminated = true; break; } '_' @@ -162,32 +159,6 @@ impl Lexer<'_> { SyntaxKind::BlockComment } - - fn whitespace(&mut self, c: char) -> SyntaxKind { - if c == ' ' && !self.s.at(char::is_whitespace) { - return SyntaxKind::Space { newlines: 0 }; - } - - self.s.uneat(); - - // Count the number of newlines. - let mut newlines = 0; - while let Some(c) = self.s.eat() { - if !c.is_whitespace() { - self.s.uneat(); - break; - } - - if is_newline(c) { - if c == '\r' { - self.s.eat_if('\n'); - } - newlines += 1; - } - } - - SyntaxKind::Space { newlines } - } } /// Markup. @@ -199,9 +170,9 @@ impl Lexer<'_> { '`' => self.raw(), 'h' if self.s.eat_if("ttp://") => self.link(), 'h' if self.s.eat_if("ttps://") => self.link(), + '0'..='9' => self.numbering(start), '<' if self.s.at(is_id_continue) => self.label(), '@' if self.s.at(is_id_continue) => self.reference(), - '0'..='9' => self.numbering(start), '#' if self.s.eat_if('{') => SyntaxKind::LeftBrace, '#' if self.s.eat_if('[') => SyntaxKind::LeftBracket, '#' if self.s.at(is_id_start) => { @@ -225,63 +196,28 @@ impl Lexer<'_> { '\'' => SyntaxKind::SmartQuote, '"' => SyntaxKind::SmartQuote, '$' => SyntaxKind::Dollar, - '=' => SyntaxKind::Eq, - '+' => SyntaxKind::Plus, - '/' => SyntaxKind::Slash, '~' => SyntaxKind::Shorthand, ':' => SyntaxKind::Colon, - '-' => SyntaxKind::Minus, - - _ => self.text(), - } - } - - fn text(&mut self) -> SyntaxKind { - macro_rules! table { - ($(|$c:literal)*) => { - static TABLE: [bool; 128] = { - let mut t = [false; 128]; - $(t[$c as usize] = true;)* - t - }; - }; - } - - table! { - | ' ' | '\t' | '\n' | '\x0b' | '\x0c' | '\r' | '\\' | '/' - | '[' | ']' | '{' | '}' | '~' | '-' | '.' | '\'' | '"' - | '*' | '_' | ':' | 'h' | '`' | '$' | '<' | '>' | '@' | '#' - }; - - loop { - self.s.eat_until(|c: char| { - TABLE.get(c as usize).copied().unwrap_or_else(|| c.is_whitespace()) - }); - - // Continue with the same text node if the thing would become text - // anyway. - let mut s = self.s; - match s.eat() { - Some(' ') if s.at(char::is_alphanumeric) => {} - Some('/') if !s.at(['/', '*']) => {} - Some('-') if !s.at(['-', '?']) => {} - Some('.') if !s.at("..") => {} - Some('h') if !s.at("ttp://") && !s.at("ttps://") => {} - Some('@' | '#') if !s.at(is_id_start) => {} - _ => break, + '=' => { + self.s.eat_while('='); + if self.space_and_more() { + SyntaxKind::HeadingMarker + } else { + self.text() + } } + '-' if self.space_and_more() => SyntaxKind::ListMarker, + '+' if self.space_and_more() => SyntaxKind::EnumMarker, + '/' if self.space_and_more() => SyntaxKind::TermMarker, - self.s = s; + _ => self.text(), } - - SyntaxKind::Text } fn backslash(&mut self) -> SyntaxKind { if self.s.eat_if("u{") { let hex = self.s.eat_while(char::is_ascii_alphanumeric); if !self.s.eat_if('}') { - self.terminated = false; return self.error_at_end("expected closing brace"); } @@ -324,33 +260,14 @@ impl Lexer<'_> { } } - fn link(&mut self) -> SyntaxKind { - #[rustfmt::skip] - self.s.eat_while(|c: char| matches!(c, - | '0' ..= '9' - | 'a' ..= 'z' - | 'A' ..= 'Z' - | '~' | '/' | '%' | '?' | '#' | '&' | '+' | '=' - | '\'' | '.' | ',' | ';' - )); - - if self.s.scout(-1) == Some('.') { - self.s.uneat(); - } - - SyntaxKind::Link - } - fn raw(&mut self) -> SyntaxKind { - let column = self.column(self.s.cursor() - 1); - let mut backticks = 1; while self.s.eat_if('`') { backticks += 1; } if backticks == 2 { - return SyntaxKind::Raw { column }; + return SyntaxKind::Raw; } let mut found = 0; @@ -363,7 +280,6 @@ impl Lexer<'_> { } if found != backticks { - self.terminated = false; let remaining = backticks - found; let noun = if remaining == 1 { "backtick" } else { "backticks" }; return self.error_at_end(if found == 0 { @@ -373,7 +289,24 @@ impl Lexer<'_> { }); } - SyntaxKind::Raw { column } + SyntaxKind::Raw + } + + fn link(&mut self) -> SyntaxKind { + #[rustfmt::skip] + self.s.eat_while(|c: char| matches!(c, + | '0' ..= '9' + | 'a' ..= 'z' + | 'A' ..= 'Z' + | '~' | '/' | '%' | '?' | '#' | '&' | '+' | '=' + | '\'' | '.' | ',' | ';' + )); + + if self.s.scout(-1) == Some('.') { + self.s.uneat(); + } + + SyntaxKind::Link } fn numbering(&mut self, start: usize) -> SyntaxKind { @@ -386,23 +319,86 @@ impl Lexer<'_> { return self.error("must be positive"); } - return SyntaxKind::EnumNumbering; + return SyntaxKind::EnumMarker; } } self.text() } + fn label(&mut self) -> SyntaxKind { + let label = self.s.eat_while(is_id_continue); + if label.is_empty() { + return self.error("label cannot be empty"); + } + + if !self.s.eat_if('>') { + return self.error_at_end("expected closing angle bracket"); + } + + SyntaxKind::Label + } + fn reference(&mut self) -> SyntaxKind { self.s.eat_while(is_id_continue); SyntaxKind::Ref } + fn text(&mut self) -> SyntaxKind { + macro_rules! table { + ($(|$c:literal)*) => { + static TABLE: [bool; 128] = { + let mut t = [false; 128]; + $(t[$c as usize] = true;)* + t + }; + }; + } + + table! { + | ' ' | '\t' | '\n' | '\x0b' | '\x0c' | '\r' | '\\' | '/' + | '[' | ']' | '{' | '}' | '~' | '-' | '.' | '\'' | '"' + | '*' | '_' | ':' | 'h' | '`' | '$' | '<' | '>' | '@' | '#' + }; + + loop { + self.s.eat_until(|c: char| { + TABLE.get(c as usize).copied().unwrap_or_else(|| c.is_whitespace()) + }); + + // Continue with the same text node if the thing would become text + // anyway. + let mut s = self.s; + match s.eat() { + Some(' ') if s.at(char::is_alphanumeric) => {} + Some('/') if !s.at(['/', '*']) => {} + Some('-') if !s.at(['-', '?']) => {} + Some('.') if !s.at("..") => {} + Some('h') if !s.at("ttp://") && !s.at("ttps://") => {} + Some('@' | '#') if !s.at(is_id_start) => {} + _ => break, + } + + self.s = s; + } + + SyntaxKind::Text + } + fn in_word(&self) -> bool { - let alphanumeric = |c: Option<char>| c.map_or(false, |c| c.is_alphanumeric()); + let alphanum = |c: Option<char>| c.map_or(false, |c| c.is_alphanumeric()); let prev = self.s.scout(-2); let next = self.s.peek(); - alphanumeric(prev) && alphanumeric(next) + alphanum(prev) && alphanum(next) + } + + fn space_and_more(&self) -> bool { + let mut s = self.s; + if !s.at(char::is_whitespace) { + return false; + } + s.eat_while(|c: char| c.is_whitespace() && !is_newline(c)); + !s.done() && !s.at(is_newline) } } @@ -586,26 +582,11 @@ impl Lexer<'_> { }); if !self.s.eat_if('"') { - self.terminated = false; return self.error_at_end("expected quote"); } SyntaxKind::Str } - - fn label(&mut self) -> SyntaxKind { - let label = self.s.eat_while(is_id_continue); - if label.is_empty() { - return self.error("label cannot be empty"); - } - - if !self.s.eat_if('>') { - self.terminated = false; - return self.error_at_end("expected closing angle bracket"); - } - - SyntaxKind::Label - } } /// Try to parse an identifier into a keyword. @@ -632,34 +613,6 @@ fn keyword(ident: &str) -> Option<SyntaxKind> { }) } -/// The column index of a given index in the source string, given a column -/// offset for the first line. -fn column(string: &str, index: usize, offset: usize) -> usize { - let mut apply_offset = false; - let res = string[..index] - .char_indices() - .rev() - .take_while(|&(_, c)| !is_newline(c)) - .inspect(|&(i, _)| { - if i == 0 { - apply_offset = true - } - }) - .count(); - - // The loop is never executed if the slice is empty, but we are of - // course still at the start of the first line. - if index == 0 { - apply_offset = true; - } - - if apply_offset { - res + offset - } else { - res - } -} - /// Whether this character denotes a newline. #[inline] pub fn is_newline(character: char) -> bool { @@ -695,6 +648,21 @@ pub(super) fn split_newlines(text: &str) -> Vec<&str> { lines } +/// Count the number of newlines in text. +fn count_newlines(text: &str) -> usize { + let mut newlines = 0; + let mut s = Scanner::new(text); + while let Some(c) = s.eat() { + if is_newline(c) { + if c == '\r' { + s.eat_if('\n'); + } + newlines += 1; + } + } + newlines +} + /// Whether a string is a valid unicode identifier. /// /// In addition to what is specified in the [Unicode Standard][uax31], we allow: |
