diff options
| author | Ian Wrzesinski <wrzian@umich.edu> | 2024-10-10 11:57:27 -0400 |
|---|---|---|
| committer | Ian Wrzesinski <wrzian@umich.edu> | 2024-10-31 18:26:12 -0400 |
| commit | 1cecae0333efcdfcfcca8e4e97ef590297808c2e (patch) | |
| tree | 756f7978a2671c894ab1ba2c50c76534cc21e524 /crates/typst-syntax/src/lexer.rs | |
| parent | 01186779cd92a7bad6ebff9154a85c6ab86cf7cb (diff) | |
7. Return SyntaxNodes from the Lexer
Diffstat (limited to 'crates/typst-syntax/src/lexer.rs')
| -rw-r--r-- | crates/typst-syntax/src/lexer.rs | 44 |
1 file changed, 25 insertions, 19 deletions
diff --git a/crates/typst-syntax/src/lexer.rs b/crates/typst-syntax/src/lexer.rs index 721225c6..cdd4121c 100644 --- a/crates/typst-syntax/src/lexer.rs +++ b/crates/typst-syntax/src/lexer.rs @@ -4,12 +4,12 @@ use unicode_script::{Script, UnicodeScript}; use unicode_segmentation::UnicodeSegmentation; use unscanny::Scanner; -use crate::{SyntaxError, SyntaxKind}; +use crate::{SyntaxError, SyntaxKind, SyntaxNode}; -/// Splits up a string of source code into tokens. +/// An iterator over a source code string which returns tokens. #[derive(Clone)] pub(super) struct Lexer<'s> { - /// The underlying scanner. + /// The scanner: contains the underlying string and location as a "cursor". s: Scanner<'s>, /// The mode the lexer is in. This determines which kinds of tokens it /// produces. @@ -73,11 +73,6 @@ impl<'s> Lexer<'s> { pub fn newline(&self) -> bool { self.newline } - - /// Take out the last error, if any. - pub fn take_error(&mut self) -> Option<SyntaxError> { - self.error.take() - } } impl Lexer<'_> { @@ -97,21 +92,24 @@ impl Lexer<'_> { /// Shared methods with all [`LexMode`]. impl Lexer<'_> { - /// Proceed to the next token and return its [`SyntaxKind`]. Note the - /// token could be a [trivia](SyntaxKind::is_trivia). - pub fn next(&mut self) -> SyntaxKind { + /// Return the next token in our text. 
Returns both the [`SyntaxNode`] + /// and the raw [`SyntaxKind`] to make it more ergonomic to check the kind + pub fn next(&mut self) -> (SyntaxKind, SyntaxNode) { + debug_assert!(self.error.is_none()); + let start = self.s.cursor(); if self.mode == LexMode::Raw { - let Some((kind, end)) = self.raw.pop() else { - return SyntaxKind::End; + let kind = if let Some((kind, end)) = self.raw.pop() { + self.s.jump(end); + kind + } else { + SyntaxKind::End }; - self.s.jump(end); - return kind; + let node = SyntaxNode::leaf(kind, self.s.from(start)); + return (kind, node); } self.newline = false; - self.error = None; - let start = self.s.cursor(); - match self.s.eat() { + let kind = match self.s.eat() { Some(c) if is_space(c, self.mode) => self.whitespace(start, c), Some('/') if self.s.eat_if('/') => self.line_comment(), Some('/') if self.s.eat_if('*') => self.block_comment(), @@ -132,13 +130,21 @@ impl Lexer<'_> { }, None => SyntaxKind::End, - } + }; + + let text = self.s.from(start); + let node = match self.error.take() { + Some(error) => SyntaxNode::error(error, text), + None => SyntaxNode::leaf(kind, text), + }; + (kind, node) } /// Eat whitespace characters greedily. fn whitespace(&mut self, start: usize, c: char) -> SyntaxKind { let more = self.s.eat_while(|c| is_space(c, self.mode)); let newlines = match c { + // Optimize eating a single space. ' ' if more.is_empty() => 0, _ => count_newlines(self.s.from(start)), }; |
