From 6527d31dfba78330a39e52d7772f6c8561fb23ef Mon Sep 17 00:00:00 2001 From: Laurenz Date: Mon, 13 Jan 2020 13:02:33 +0100 Subject: =?UTF-8?q?Merge=20Characters=20struct=20into=20tokenizer=20?= =?UTF-8?q?=F0=9F=94=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/syntax/expr.rs | 21 --------- src/syntax/tokens.rs | 127 +++++++++++++++++++++++++-------------------------- 2 files changed, 63 insertions(+), 85 deletions(-) (limited to 'src/syntax') diff --git a/src/syntax/expr.rs b/src/syntax/expr.rs index a1680861..ed5e50df 100644 --- a/src/syntax/expr.rs +++ b/src/syntax/expr.rs @@ -166,27 +166,6 @@ impl Display for Ident { debug_display!(Ident); -/// Whether this word is a valid identifier. -pub fn is_identifier(string: &str) -> bool { - let mut chars = string.chars(); - - match chars.next() { - Some('-') => {} - Some(c) if UnicodeXID::is_xid_start(c) => {} - _ => return false, - } - - while let Some(c) = chars.next() { - match c { - '.' | '-' => {} - c if UnicodeXID::is_xid_continue(c) => {} - _ => return false, - } - } - - true -} - /// Kinds of expressions. pub trait ExpressionKind: Sized { const NAME: &'static str; diff --git a/src/syntax/tokens.rs b/src/syntax/tokens.rs index d355b3cc..ae5cfe48 100644 --- a/src/syntax/tokens.rs +++ b/src/syntax/tokens.rs @@ -72,9 +72,11 @@ pub fn tokenize(src: &str) -> Tokens { /// An iterator over the tokens of a string of source code. pub struct Tokens<'s> { src: &'s str, - chars: Characters<'s>, state: State, stack: Vec, + iter: Peekable>, + position: Position, + index: usize, } #[derive(Debug, Copy, Clone, Eq, PartialEq)] @@ -88,9 +90,11 @@ impl<'s> Tokens<'s> { pub fn new(src: &'s str) -> Tokens<'s> { Tokens { src, - chars: Characters::new(src), state: State::Body, stack: vec![], + iter: src.chars().peekable(), + position: Position::ZERO, + index: 0, } } } @@ -100,26 +104,29 @@ impl<'s> Iterator for Tokens<'s> { /// Parse the next token in the source code. fn next(&mut self) -> Option>> { - let start = self.chars.position(); - let first = self.chars.next()?; - let second = self.chars.peek(); + let start = self.pos(); + let first = self.eat()?; let token = match first { // Comments. - '/' if second == Some('/') => self.parse_line_comment(), - '/' if second == Some('*') => self.parse_block_comment(), - '*' if second == Some('/') => { self.eat(); StarSlash } + '/' if self.peek() == Some('/') => self.parse_line_comment(), + '/' if self.peek() == Some('*') => self.parse_block_comment(), + '*' if self.peek() == Some('/') => { self.eat(); StarSlash } // Whitespace. c if c.is_whitespace() => self.parse_whitespace(start), // Functions. - '[' => { self.set_state(Header); LeftBracket } + '[' => { + self.stack.push(self.state); + self.state = Header; + LeftBracket + } ']' => { - if self.state == Header && second == Some('[') { + if self.state == Header && self.peek() == Some('[') { self.state = StartBody; } else { - self.pop_state(); + self.state = self.stack.pop().unwrap_or(Body); } RightBracket @@ -164,7 +171,7 @@ impl<'s> Iterator for Tokens<'s> { } }; - let end = self.chars.position(); + let end = self.pos(); let span = Span { start, end }; Some(Spanned { v: token, span }) @@ -206,7 +213,7 @@ impl<'s> Tokens<'s> { fn parse_whitespace(&mut self, start: Position) -> Token<'s> { self.read_string_until(|n| !n.is_whitespace(), false, 0, 0); - let end = self.chars.position(); + let end = self.pos(); Whitespace(end.line - start.line) } @@ -234,9 +241,9 @@ impl<'s> Tokens<'s> { } } - let c = self.chars.peek().unwrap_or('n'); + let c = self.peek().unwrap_or('n'); if self.state == Body && is_escapable(c) { - let index = self.chars.index(); + let index = self.index(); self.eat(); Text(&self.src[index .. index + c.len_utf8()]) } else { @@ -267,22 +274,22 @@ impl<'s> Tokens<'s> { offset_start: isize, offset_end: isize, ) -> &'s str where F: FnMut(char) -> bool { - let start = ((self.chars.index() as isize) + offset_start) as usize; + let start = ((self.index() as isize) + offset_start) as usize; let mut matched = false; - while let Some(c) = self.chars.peek() { + while let Some(c) = self.peek() { if f(c) { matched = true; if eat_match { - self.chars.next(); + self.eat(); } break; } - self.chars.next(); + self.eat(); } - let mut end = self.chars.index(); + let mut end = self.index(); if matched { end = ((end as isize) + offset_end) as usize; } @@ -290,17 +297,32 @@ impl<'s> Tokens<'s> { &self.src[start .. end] } - fn set_state(&mut self, state: State) { - self.stack.push(self.state); - self.state = state; + fn eat(&mut self) -> Option { + let c = self.iter.next()?; + let len = c.len_utf8(); + + self.index += len; + + if is_newline_char(c) && !(c == '\r' && self.peek() == Some('\n')) { + self.position.line += 1; + self.position.column = 0; + } else { + self.position.column += len; + } + + Some(c) + } + + fn peek(&mut self) -> Option { + self.iter.peek().copied() } - fn pop_state(&mut self) { - self.state = self.stack.pop().unwrap_or(Body); + fn index(&self) -> usize { + self.index } - fn eat(&mut self) { - self.chars.next(); + fn pos(&self) -> Position { + self.position } } @@ -313,7 +335,7 @@ fn parse_percentage(text: &str) -> Option { } /// Whether this character denotes a newline. -fn is_newline_char(character: char) -> bool { +pub fn is_newline_char(character: char) -> bool { match character { // Line Feed, Vertical Tab, Form Feed, Carriage Return. '\x0A' ..= '\x0D' => true, @@ -323,46 +345,23 @@ fn is_newline_char(character: char) -> bool { } } -struct Characters<'s> { - iter: Peekable>, - position: Position, - index: usize, -} +/// Whether this word is a valid identifier. +pub fn is_identifier(string: &str) -> bool { + let mut chars = string.chars(); -impl<'s> Characters<'s> { - fn new(src: &'s str) -> Characters<'s> { - Characters { - iter: src.chars().peekable(), - position: Position::ZERO, - index: 0, - } + match chars.next() { + Some('-') => {} + Some(c) if UnicodeXID::is_xid_start(c) => {} + _ => return false, } - fn next(&mut self) -> Option { - let c = self.iter.next()?; - let len = c.len_utf8(); - - self.index += len; - - if is_newline_char(c) && !(c == '\r' && self.peek() == Some('\n')) { - self.position.line += 1; - self.position.column = 0; - } else { - self.position.column += len; + while let Some(c) = chars.next() { + match c { + '.' | '-' => {} + c if UnicodeXID::is_xid_continue(c) => {} + _ => return false, } - - Some(c) - } - - fn peek(&mut self) -> Option { - self.iter.peek().copied() } - fn index(&self) -> usize { - self.index - } - - fn position(&self) -> Position { - self.position - } + true } -- cgit v1.2.3