summaryrefslogtreecommitdiff
path: root/src/syntax
diff options
context:
space:
mode:
authorLaurenz <laurmaedje@gmail.com>2019-10-30 21:13:12 +0100
committerLaurenz <laurmaedje@gmail.com>2019-10-30 21:13:12 +0100
commitb5d8b8f4a5425ec7bcaa50d8394e76cffe4baadc (patch)
tree23b26258ac200b9150b7485ef95b51d6cf9b7412 /src/syntax
parentccc4639c7d4dfe039d469d16236ac5ad121f4a07 (diff)
Token spans 🔜🔙
Diffstat (limited to 'src/syntax')
-rw-r--r--src/syntax/mod.rs26
-rw-r--r--src/syntax/parsing.rs21
-rw-r--r--src/syntax/tokens.rs182
3 files changed, 142 insertions, 87 deletions
diff --git a/src/syntax/mod.rs b/src/syntax/mod.rs
index da8cdc80..8a6329de 100644
--- a/src/syntax/mod.rs
+++ b/src/syntax/mod.rs
@@ -129,3 +129,29 @@ impl Display for Expression {
}
}
}
+
+pub struct Spanned<T> {
+ pub val: T,
+ pub span: Span,
+}
+
+impl<T> Spanned<T> {
+ pub fn new(val: T, span: Span) -> Spanned<T> {
+ Spanned { val, span }
+ }
+}
+
+pub struct Span {
+ pub start: usize,
+ pub end: usize,
+}
+
+impl Span {
+ pub fn new(start: usize, end: usize) -> Span {
+ Span { start, end }
+ }
+
+ pub fn at(index: usize) -> Span {
+ Span { start: index, end: index + 1 }
+ }
+}
diff --git a/src/syntax/parsing.rs b/src/syntax/parsing.rs
index 1e949729..f4013f2b 100644
--- a/src/syntax/parsing.rs
+++ b/src/syntax/parsing.rs
@@ -217,12 +217,9 @@ impl<'s> Parser<'s> {
// Do the parsing dependent on whether the function has a body.
Ok(if has_body {
// Find out the string which makes the body of this function.
- let (start, end) = self
- .tokens
- .string_index()
- .and_then(|index| {
- find_closing_bracket(&self.src[index..]).map(|end| (index, index + end))
- })
+ let start = self.tokens.string_index();
+ let end = find_closing_bracket(&self.src[start..])
+ .map(|end| start + end)
.ok_or_else(|| ParseError::new("expected closing bracket"))?;
// Parse the body.
@@ -370,17 +367,15 @@ impl<'s> PeekableTokens<'s> {
/// Peek at the next element.
fn peek(&mut self) -> Option<Token<'s>> {
let iter = &mut self.tokens;
- *self.peeked.get_or_insert_with(|| iter.next())
+ *self.peeked.get_or_insert_with(|| iter.next().map(|token| token.val))
}
- /// The index of the first character of the next token in the source string.
- fn string_index(&mut self) -> Option<usize> {
- self.tokens.chars.string_index()
+ fn string_index(&mut self) -> usize {
+ self.tokens.string_index()
}
- /// Go to a new position in the underlying string.
fn set_string_index(&mut self, index: usize) {
- self.tokens.chars.set_string_index(index);
+ self.tokens.set_string_index(index);
self.peeked = None;
}
}
@@ -391,7 +386,7 @@ impl<'s> Iterator for PeekableTokens<'s> {
fn next(&mut self) -> Option<Token<'s>> {
match self.peeked.take() {
Some(value) => value,
- None => self.tokens.next(),
+ None => self.tokens.next().map(|token| token.val),
}
}
}
diff --git a/src/syntax/tokens.rs b/src/syntax/tokens.rs
index 460884a4..fbf4c3ac 100644
--- a/src/syntax/tokens.rs
+++ b/src/syntax/tokens.rs
@@ -12,7 +12,7 @@ pub fn tokenize(src: &str) -> Tokens {
#[derive(Debug, Clone)]
pub struct Tokens<'s> {
src: &'s str,
- pub(super) chars: PeekableChars<'s>,
+ chars: PeekableChars<'s>,
state: TokensState,
stack: SmallVec<[TokensState; 1]>,
}
@@ -31,7 +31,7 @@ enum TokensState {
impl<'s> Tokens<'s> {
/// Create a new token stream from source code.
- fn new(src: &'s str) -> Tokens<'s> {
+ pub fn new(src: &'s str) -> Tokens<'s> {
Tokens {
src,
chars: PeekableChars::new(src),
@@ -40,6 +40,16 @@ impl<'s> Tokens<'s> {
}
}
+ /// The index of the first character of the next token in the source string.
+ pub fn string_index(&mut self) -> usize {
+ self.chars.string_index()
+ }
+
+ /// Go to a new position in the underlying string.
+ pub fn set_string_index(&mut self, index: usize) {
+ self.chars.set_string_index(index);
+ }
+
/// Advance the iterator by one step.
fn advance(&mut self) {
self.chars.next();
@@ -55,81 +65,69 @@ impl<'s> Tokens<'s> {
fn unswitch(&mut self) {
self.state = self.stack.pop().unwrap_or(TokensState::Body);
}
-
- /// Advance and return the given token.
- fn consumed(&mut self, token: Token<'s>) -> Token<'s> {
- self.advance();
- token
- }
-
- /// Returns a word containing the string bounded by the given indices.
- fn text(&self, start: usize, end: usize) -> Token<'s> {
- Token::Text(&self.src[start..end])
- }
}
impl<'s> Iterator for Tokens<'s> {
- type Item = Token<'s>;
+ type Item = Spanned<Token<'s>>;
/// Advance the iterator, return the next token or nothing.
- fn next(&mut self) -> Option<Token<'s>> {
- use TokensState as TU;
+ fn next(&mut self) -> Option<Self::Item> {
+ use TokensState as TS;
// Go to the body state if the function has a body or return to the top-of-stack
// state.
- if self.state == TU::MaybeBody {
- if self.chars.peek()?.1 == '[' {
- self.state = TU::Body;
- return Some(self.consumed(Token::LeftBracket));
+ if self.state == TS::MaybeBody {
+ if let Some((index, '[')) = self.chars.peek() {
+ self.advance();
+ self.state = TS::Body;
+ return Some(Spanned::new(Token::LeftBracket, Span::at(index)));
} else {
self.unswitch();
}
}
// Take the next char and peek at the one behind.
- let (next_pos, next) = self.chars.next()?;
- let afterwards = self.chars.peek().map(|p| p.1);
+ let (pos, next) = self.chars.next()?;
+ let afterwards = self.chars.peekc();
- Some(match next {
+ let token = match next {
// Functions
'[' => {
- self.switch(TU::Function);
+ self.switch(TS::Function);
Token::LeftBracket
}
']' => {
- if self.state == TU::Function {
- self.state = TU::MaybeBody;
+ if self.state == TS::Function {
+ self.state = TS::MaybeBody;
} else {
self.unswitch();
}
+
Token::RightBracket
}
// Line comment
'/' if afterwards == Some('/') => {
- let mut end = self.chars.next().unwrap();
- let start = end.0 + end.1.len_utf8();
+ let start = self.string_index() + 1;
- while let Some((index, c)) = self.chars.peek() {
+ while let Some(c) = self.chars.peekc() {
if is_newline_char(c) {
break;
}
self.advance();
- end = (index, c);
}
- let end = end.0 + end.1.len_utf8();
+ let end = self.string_index();
Token::LineComment(&self.src[start..end])
}
// Block comment
'/' if afterwards == Some('*') => {
- let mut end = self.chars.next().unwrap();
- let start = end.0 + end.1.len_utf8();
-
+ let start = self.string_index() + 1;
let mut nested = 0;
- while let Some((index, c)) = self.chars.next() {
- let after = self.chars.peek().map(|p| p.1);
+
+ while let Some((_, c)) = self.chars.next() {
+ let after = self.chars.peekc();
match (c, after) {
('*', Some('/')) if nested == 0 => {
self.advance();
@@ -145,58 +143,62 @@ impl<'s> Iterator for Tokens<'s> {
}
_ => {}
}
- end = (index, c);
}
- let end = end.0 + end.1.len_utf8();
+ let end = self.string_index() - 2;
Token::BlockComment(&self.src[start..end])
}
// Unexpected end of block comment
- '*' if afterwards == Some('/') => self.consumed(Token::StarSlash),
+ '*' if afterwards == Some('/') => {
+ self.advance();
+ Token::StarSlash
+ }
// Whitespace
' ' | '\t' => {
- while let Some((_, c)) = self.chars.peek() {
+ while let Some(c) = self.chars.peekc() {
match c {
' ' | '\t' => self.advance(),
_ => break,
}
}
+
Token::Space
}
// Newlines
- '\r' if afterwards == Some('\n') => self.consumed(Token::Newline),
+ '\r' if afterwards == Some('\n') => {
+ self.advance();
+ Token::Newline
+ },
c if is_newline_char(c) => Token::Newline,
// Star/Underscore/Backtick in bodies
- '*' if self.state == TU::Body => Token::Star,
- '_' if self.state == TU::Body => Token::Underscore,
- '`' if self.state == TU::Body => Token::Backtick,
+ '*' if self.state == TS::Body => Token::Star,
+ '_' if self.state == TS::Body => Token::Underscore,
+ '`' if self.state == TS::Body => Token::Backtick,
// Context sensitive operators in headers
- ':' if self.state == TU::Function => Token::Colon,
- '=' if self.state == TU::Function => Token::Equals,
- ',' if self.state == TU::Function => Token::Comma,
+ ':' if self.state == TS::Function => Token::Colon,
+ '=' if self.state == TS::Function => Token::Equals,
+ ',' if self.state == TS::Function => Token::Comma,
// A string value.
- '"' if self.state == TU::Function => {
- // Find out when the word ends.
+ '"' if self.state == TS::Function => {
+ let start = self.string_index();
let mut escaped = false;
- let mut end = (next_pos, next);
- while let Some((index, c)) = self.chars.next() {
+ while let Some((_, c)) = self.chars.next() {
if c == '"' && !escaped {
break;
}
escaped = c == '\\';
- end = (index, c);
}
- let end_pos = end.0 + end.1.len_utf8();
- Token::Quoted(&self.src[next_pos + 1..end_pos])
+ let end = self.string_index() - 1;
+ Token::Quoted(&self.src[start..end])
}
// Escaping
@@ -209,25 +211,26 @@ impl<'s> Iterator for Tokens<'s> {
if escapable {
self.advance();
- return Some(self.text(index, index + c.len_utf8()));
+ Token::Text(&self.src[index..index + c.len_utf8()])
+ } else {
+ Token::Text("\\")
}
+ } else {
+ Token::Text("\\")
}
-
- Token::Text("\\")
}
// Normal text
_ => {
// Find out when the word ends.
- let mut end = (next_pos, next);
- while let Some((index, c)) = self.chars.peek() {
+ while let Some((_, c)) = self.chars.peek() {
let second = self.chars.peekn(1).map(|p| p.1);
// Whether the next token is still from the text or not.
let continues = match c {
'[' | ']' | '\\' => false,
- '*' | '_' | '`' if self.state == TU::Body => false,
- ':' | '=' | ',' | '"' if self.state == TU::Function => false,
+ '*' | '_' | '`' if self.state == TS::Body => false,
+ ':' | '=' | ',' | '"' if self.state == TS::Function => false,
'/' => second != Some('/') && second != Some('*'),
'*' => second != Some('/'),
@@ -242,14 +245,15 @@ impl<'s> Iterator for Tokens<'s> {
break;
}
- end = (index, c);
self.advance();
}
- let end_pos = end.0 + end.1.len_utf8();
- self.text(next_pos, end_pos)
+ let end = self.string_index();
+ Token::Text(&self.src[pos..end])
}
- })
+ };
+
+ Some(Spanned::new(token, Span::new(pos, self.string_index())))
}
}
@@ -266,8 +270,9 @@ fn is_newline_char(character: char) -> bool {
pub struct PeekableChars<'s> {
string: &'s str,
chars: CharIndices<'s>,
- base: usize,
peeked: SmallVec<[Option<(usize, char)>; 2]>,
+ base: usize,
+ index: usize,
}
impl<'s> PeekableChars<'s> {
@@ -276,8 +281,9 @@ impl<'s> PeekableChars<'s> {
PeekableChars {
string,
chars: string.char_indices(),
- base: 0,
peeked: SmallVec::new(),
+ base: 0,
+ index: 0,
}
}
@@ -286,6 +292,11 @@ impl<'s> PeekableChars<'s> {
self.peekn(0)
}
+ /// Peek at the char of the next element.
+ pub fn peekc(&mut self) -> Option<char> {
+ self.peekn(0).map(|p| p.1)
+ }
+
/// Peek at the element after the next element.
pub fn peekn(&mut self, n: usize) -> Option<(usize, char)> {
while self.peeked.len() <= n {
@@ -298,18 +309,17 @@ impl<'s> PeekableChars<'s> {
/// Return the next value of the inner iterator mapped with the offset.
pub fn next_inner(&mut self) -> Option<(usize, char)> {
- self.chars.next().map(|(i, c)| (i + self.base, c))
+ self.chars.next().map(|(i, c)| (self.base + i, c))
}
- /// The index of the first character of the next token in the source string.
- pub fn string_index(&mut self) -> Option<usize> {
- self.peek().map(|p| p.0)
+ pub fn string_index(&mut self) -> usize {
+ self.index
}
- /// Go to a new position in the underlying string.
pub fn set_string_index(&mut self, index: usize) {
self.chars = self.string[index..].char_indices();
self.base = index;
+ self.index = 0;
self.peeked.clear();
}
}
@@ -318,11 +328,17 @@ impl Iterator for PeekableChars<'_> {
type Item = (usize, char);
fn next(&mut self) -> Option<(usize, char)> {
- if !self.peeked.is_empty() {
+ let next = if !self.peeked.is_empty() {
self.peeked.remove(0)
} else {
self.next_inner()
+ };
+
+ if let Some((index, c)) = next {
+ self.index = index + c.len_utf8();
}
+
+ next
}
}
@@ -337,7 +353,16 @@ mod tests {
/// Test if the source code tokenizes to the tokens.
fn test(src: &str, tokens: Vec<Token>) {
- assert_eq!(Tokens::new(src).collect::<Vec<_>>(), tokens);
+ assert_eq!(Tokens::new(src)
+ .map(|token| token.val)
+ .collect::<Vec<_>>(), tokens);
+ }
+
+ /// Test if the tokens of the source code have the correct spans.
+ fn test_span(src: &str, spans: Vec<(usize, usize)>) {
+ assert_eq!(Tokens::new(src)
+ .map(|token| (token.span.start, token.span.end))
+ .collect::<Vec<_>>(), spans);
}
/// Tokenizes the basic building blocks.
@@ -462,4 +487,13 @@ mod tests {
test("[document][Hello 🌍!]", vec![L, T("document"), R, L, T("Hello"), S, T("🌍!"), R]);
test("[f]⺐.", vec![L, T("f"), R, T("⺐.")]);
}
+
+ /// This test checks if all tokens have the correct spans.
+ #[test]
+ #[rustfmt::skip]
+ fn tokenize_spans() {
+ test_span("Hello World", vec![(0, 5), (5, 6), (6, 11)]);
+ test_span("🌍_🎈", vec![(0, 4), (4, 5), (5, 9)]);
+ test_span("[hello: world]", vec![(0, 1), (1, 6), (6, 7), (7, 8), (8, 13), (13, 14)]);
+ }
}