From 16f0bd430e0864a3bbd0139803e476be413cb3cb Mon Sep 17 00:00:00 2001 From: Laurenz Date: Thu, 1 Oct 2020 11:05:16 +0200 Subject: =?UTF-8?q?Rename=20CharParser=20to=20Scanner=20=E2=9C=8F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/parse/chars.rs | 171 --------------------------------------------------- src/parse/mod.rs | 4 +- src/parse/resolve.rs | 30 ++++----- src/parse/scanner.rs | 171 +++++++++++++++++++++++++++++++++++++++++++++++++++ src/parse/tokens.rs | 78 +++++++++++------------ src/syntax/lines.rs | 8 +-- 6 files changed, 231 insertions(+), 231 deletions(-) delete mode 100644 src/parse/chars.rs create mode 100644 src/parse/scanner.rs (limited to 'src') diff --git a/src/parse/chars.rs b/src/parse/chars.rs deleted file mode 100644 index 62d40771..00000000 --- a/src/parse/chars.rs +++ /dev/null @@ -1,171 +0,0 @@ -//! Low-level char parser. - -use std::fmt::{self, Debug, Formatter}; -use std::slice::SliceIndex; -use std::str::Chars; - -/// A low-level featureful char parser. -pub struct CharParser<'s> { - src: &'s str, - iter: Chars<'s>, - index: usize, -} - -impl<'s> CharParser<'s> { - /// Create a new char parser. - pub fn new(src: &'s str) -> Self { - Self { src, iter: src.chars(), index: 0 } - } - - /// Consume the next char. - pub fn eat(&mut self) -> Option { - let next = self.iter.next(); - if let Some(c) = next { - self.index += c.len_utf8(); - } - next - } - - /// Consume the next char if it is the given one. - /// - /// Returns whether the char was consumed. - pub fn eat_if(&mut self, c: char) -> bool { - // Don't decode the char twice through peek() and eat(). - // - // TODO: Benchmark this vs. the naive version. - if self.iter.next() == Some(c) { - self.index += c.len_utf8(); - true - } else { - self.reset(); - false - } - } - - /// Consume the next char, debug-asserting that it is the given one. - pub fn eat_assert(&mut self, c: char) { - let next = self.eat(); - debug_assert_eq!(next, Some(c)); - } - - /// Consume the next char, coalescing `\r\n` to just `\n`. - pub fn eat_merging_crlf(&mut self) -> Option { - let c = self.eat(); - if c == Some('\r') && self.eat_if('\n') { - Some('\n') - } else { - c - } - } - - /// Eat chars while the condition is true. - pub fn eat_while(&mut self, mut f: impl FnMut(char) -> bool) -> &'s str { - self.eat_until(|c| !f(c)) - } - - /// Eat chars until the condition is true. - pub fn eat_until(&mut self, mut f: impl FnMut(char) -> bool) -> &'s str { - let start = self.index; - while let Some(c) = self.iter.next() { - if f(c) { - // Undo the previous `next()` without peeking all the time - // during iteration. - // - // TODO: Benchmark this vs. the naive peeking version. - self.reset(); - break; - } - self.index += c.len_utf8(); - } - &self.src[start .. self.index] - } - - /// Uneat the last eaten character. - pub fn uneat(&mut self) { - self.index = self.prev_index(); - self.reset(); - } - - /// Peek at the next char without consuming it. - pub fn peek(&self) -> Option { - self.iter.clone().next() - } - - /// Peek at the nth-next char without consuming anything. - pub fn peek_nth(&self, n: usize) -> Option { - self.iter.clone().nth(n) - } - - /// Checks whether the next character fulfills a condition. - /// - /// Returns `false` is there is no next character. - pub fn check(&self, f: impl FnMut(char) -> bool) -> bool { - self.peek().map(f).unwrap_or(false) - } -} - -impl<'s> CharParser<'s> { - /// Slice a part out of the source string. - pub fn get(&self, index: I) -> &'s str - where - I: SliceIndex, - { - &self.src[index] - } - - /// The full source string. - pub fn src(&self) -> &'s str { - self.src - } - - /// The full string up to the current index. - pub fn eaten(&self) -> &'s str { - &self.src[.. self.index] - } - - /// The string from `start` to the current index. - pub fn eaten_from(&self, start: usize) -> &'s str { - &self.src[start .. self.index] - } - - /// The remaining string after the current index. - pub fn rest(&self) -> &'s str { - &self.src[self.index ..] - } - - /// The current index in the string. - pub fn index(&self) -> usize { - self.index - } - - /// The previous index in the string. - pub fn prev_index(&self) -> usize { - self.src[.. self.index] - .chars() - .next_back() - .map(|c| self.index - c.len_utf8()) - .unwrap_or(0) - } - - /// Go back to the where the index says. - fn reset(&mut self) { - self.iter = self.src[self.index ..].chars(); - } -} - -impl Debug for CharParser<'_> { - fn fmt(&self, f: &mut Formatter) -> fmt::Result { - write!(f, "CharParser({}|{})", self.eaten(), self.rest()) - } -} - -/// Whether this character denotes a newline. -pub fn is_newline_char(character: char) -> bool { - match character { - // Line Feed, Vertical Tab, Form Feed, Carriage Return. - '\n' | '\x0B' | '\x0C' | '\r' | - // Next Line, Line Separator, Paragraph Separator. - '\u{0085}' | '\u{2028}' | '\u{2029}' => true, - _ => false, - } -} diff --git a/src/parse/mod.rs b/src/parse/mod.rs index 4d79c11b..8c879d12 100644 --- a/src/parse/mod.rs +++ b/src/parse/mod.rs @@ -1,11 +1,11 @@ //! Parsing and tokenization. -mod chars; mod resolve; +mod scanner; mod tokens; -pub use chars::*; pub use resolve::*; +pub use scanner::*; pub use tokens::*; use std::str::FromStr; diff --git a/src/parse/resolve.rs b/src/parse/resolve.rs index 422f9385..0e2ebd4b 100644 --- a/src/parse/resolve.rs +++ b/src/parse/resolve.rs @@ -1,41 +1,41 @@ //! Resolve strings and raw blocks. -use super::{is_newline_char, CharParser}; +use super::{is_newline_char, Scanner}; use crate::syntax::{Ident, Raw}; /// Resolves all escape sequences in a string. pub fn resolve_string(string: &str) -> String { let mut out = String::with_capacity(string.len()); - let mut p = CharParser::new(string); + let mut s = Scanner::new(string); - while let Some(c) = p.eat() { + while let Some(c) = s.eat() { if c != '\\' { out.push(c); continue; } - let start = p.prev_index(); - match p.eat() { + let start = s.prev_index(); + match s.eat() { Some('\\') => out.push('\\'), Some('"') => out.push('"'), Some('n') => out.push('\n'), Some('t') => out.push('\t'), - Some('u') if p.eat_if('{') => { + Some('u') if s.eat_if('{') => { // TODO: Feedback if closing brace is missing. - let sequence = p.eat_while(|c| c.is_ascii_hexdigit()); - let _terminated = p.eat_if('}'); + let sequence = s.eat_while(|c| c.is_ascii_hexdigit()); + let _terminated = s.eat_if('}'); if let Some(c) = resolve_hex(sequence) { out.push(c); } else { // TODO: Feedback that escape sequence is wrong. - out += p.eaten_from(start); + out += s.eaten_from(start); } } // TODO: Feedback about invalid escape sequence. - _ => out += p.eaten_from(start), + _ => out += s.eaten_from(start), } } @@ -69,10 +69,10 @@ pub fn resolve_raw(raw: &str, backticks: usize) -> Raw { /// Parse the lang tag and return it alongside the remaining inner raw text. fn split_at_lang_tag(raw: &str) -> (&str, &str) { - let mut p = CharParser::new(raw); + let mut s = Scanner::new(raw); ( - p.eat_until(|c| c == '`' || c.is_whitespace() || is_newline_char(c)), - p.rest(), + s.eat_until(|c| c == '`' || c.is_whitespace() || is_newline_char(c)), + s.rest(), ) } @@ -104,11 +104,11 @@ fn trim_and_split_raw(raw: &str) -> (Vec, bool) { /// Splits a string into a vector of lines (respecting Unicode & Windows line /// breaks). pub fn split_lines(text: &str) -> Vec { - let mut p = CharParser::new(text); + let mut s = Scanner::new(text); let mut line = String::new(); let mut lines = Vec::new(); - while let Some(c) = p.eat_merging_crlf() { + while let Some(c) = s.eat_merging_crlf() { if is_newline_char(c) { lines.push(std::mem::take(&mut line)); } else { diff --git a/src/parse/scanner.rs b/src/parse/scanner.rs new file mode 100644 index 00000000..1bffc204 --- /dev/null +++ b/src/parse/scanner.rs @@ -0,0 +1,171 @@ +//! Low-level char-based scanner. + +use std::fmt::{self, Debug, Formatter}; +use std::slice::SliceIndex; +use std::str::Chars; + +/// A low-level featureful char scanner. +pub struct Scanner<'s> { + src: &'s str, + iter: Chars<'s>, + index: usize, +} + +impl<'s> Scanner<'s> { + /// Create a new char scanner. + pub fn new(src: &'s str) -> Self { + Self { src, iter: src.chars(), index: 0 } + } + + /// Consume the next char. + pub fn eat(&mut self) -> Option { + let next = self.iter.next(); + if let Some(c) = next { + self.index += c.len_utf8(); + } + next + } + + /// Consume the next char if it is the given one. + /// + /// Returns whether the char was consumed. + pub fn eat_if(&mut self, c: char) -> bool { + // Don't decode the char twice through peek() and eat(). + // + // TODO: Benchmark this vs. the naive version. + if self.iter.next() == Some(c) { + self.index += c.len_utf8(); + true + } else { + self.reset(); + false + } + } + + /// Consume the next char, debug-asserting that it is the given one. + pub fn eat_assert(&mut self, c: char) { + let next = self.eat(); + debug_assert_eq!(next, Some(c)); + } + + /// Consume the next char, coalescing `\r\n` to just `\n`. + pub fn eat_merging_crlf(&mut self) -> Option { + let c = self.eat(); + if c == Some('\r') && self.eat_if('\n') { + Some('\n') + } else { + c + } + } + + /// Eat chars while the condition is true. + pub fn eat_while(&mut self, mut f: impl FnMut(char) -> bool) -> &'s str { + self.eat_until(|c| !f(c)) + } + + /// Eat chars until the condition is true. + pub fn eat_until(&mut self, mut f: impl FnMut(char) -> bool) -> &'s str { + let start = self.index; + while let Some(c) = self.iter.next() { + if f(c) { + // Undo the previous `next()` without peeking all the time + // during iteration. + // + // TODO: Benchmark this vs. the naive peeking version. + self.reset(); + break; + } + self.index += c.len_utf8(); + } + &self.src[start .. self.index] + } + + /// Uneat the last eaten character. + pub fn uneat(&mut self) { + self.index = self.prev_index(); + self.reset(); + } + + /// Peek at the next char without consuming it. + pub fn peek(&self) -> Option { + self.iter.clone().next() + } + + /// Peek at the nth-next char without consuming anything. + pub fn peek_nth(&self, n: usize) -> Option { + self.iter.clone().nth(n) + } + + /// Checks whether the next character fulfills a condition. + /// + /// Returns `false` is there is no next character. + pub fn check(&self, f: impl FnMut(char) -> bool) -> bool { + self.peek().map(f).unwrap_or(false) + } +} + +impl<'s> Scanner<'s> { + /// Slice a part out of the source string. + pub fn get(&self, index: I) -> &'s str + where + I: SliceIndex, + { + &self.src[index] + } + + /// The full source string. + pub fn src(&self) -> &'s str { + self.src + } + + /// The full string up to the current index. + pub fn eaten(&self) -> &'s str { + &self.src[.. self.index] + } + + /// The string from `start` to the current index. + pub fn eaten_from(&self, start: usize) -> &'s str { + &self.src[start .. self.index] + } + + /// The remaining string after the current index. + pub fn rest(&self) -> &'s str { + &self.src[self.index ..] + } + + /// The current index in the string. + pub fn index(&self) -> usize { + self.index + } + + /// The previous index in the string. + pub fn prev_index(&self) -> usize { + self.src[.. self.index] + .chars() + .next_back() + .map(|c| self.index - c.len_utf8()) + .unwrap_or(0) + } + + /// Go back to the where the index says. + fn reset(&mut self) { + self.iter = self.src[self.index ..].chars(); + } +} + +impl Debug for Scanner<'_> { + fn fmt(&self, f: &mut Formatter) -> fmt::Result { + write!(f, "Scanner({}|{})", self.eaten(), self.rest()) + } +} + +/// Whether this character denotes a newline. +pub fn is_newline_char(character: char) -> bool { + match character { + // Line Feed, Vertical Tab, Form Feed, Carriage Return. + '\n' | '\x0B' | '\x0C' | '\r' | + // Next Line, Line Separator, Paragraph Separator. + '\u{0085}' | '\u{2028}' | '\u{2029}' => true, + _ => false, + } +} diff --git a/src/parse/tokens.rs b/src/parse/tokens.rs index 2700b6c8..cdb92c59 100644 --- a/src/parse/tokens.rs +++ b/src/parse/tokens.rs @@ -1,6 +1,6 @@ //! Tokenization. -use super::{is_newline_char, CharParser}; +use super::{is_newline_char, Scanner}; use crate::length::Length; use crate::syntax::{Ident, Pos, Span, SpanWith, Spanned, Token}; @@ -9,7 +9,7 @@ use TokenMode::*; /// An iterator over the tokens of a string of source code. #[derive(Debug)] pub struct Tokens<'s> { - p: CharParser<'s>, + s: Scanner<'s>, mode: TokenMode, stack: Vec, } @@ -27,7 +27,7 @@ impl<'s> Tokens<'s> { /// Create a new token iterator with the given mode. pub fn new(src: &'s str, mode: TokenMode) -> Self { Self { - p: CharParser::new(src), + s: Scanner::new(src), mode, stack: vec![], } @@ -48,7 +48,7 @@ impl<'s> Tokens<'s> { /// The position in the string at which the last token ends and next token /// will start. pub fn pos(&self) -> Pos { - self.p.index().into() + self.s.index().into() } } @@ -57,15 +57,15 @@ impl<'s> Iterator for Tokens<'s> { /// Parse the next token in the source code. fn next(&mut self) -> Option { - let start = self.p.index(); - let token = match self.p.eat()? { + let start = self.s.index(); + let token = match self.s.eat()? { // Whitespace. c if c.is_whitespace() => self.read_whitespace(c), // Comments. - '/' if self.p.eat_if('/') => self.read_line_comment(), - '/' if self.p.eat_if('*') => self.read_block_comment(), - '*' if self.p.eat_if('/') => Token::Invalid("*/"), + '/' if self.s.eat_if('/') => self.read_line_comment(), + '/' if self.s.eat_if('*') => self.read_block_comment(), + '*' if self.s.eat_if('/') => Token::Invalid("*/"), // Functions. '[' => Token::LeftBracket, @@ -87,7 +87,7 @@ impl<'s> Iterator for Tokens<'s> { ':' if self.mode == Header => Token::Colon, ',' if self.mode == Header => Token::Comma, '=' if self.mode == Header => Token::Equals, - '>' if self.mode == Header && self.p.eat_if('>') => Token::Chain, + '>' if self.mode == Header && self.s.eat_if('>') => Token::Chain, // Expressions in headers. '+' if self.mode == Header => Token::Plus, @@ -101,7 +101,7 @@ impl<'s> Iterator for Tokens<'s> { _ => self.read_text_or_expr(start), }; - let end = self.p.index(); + let end = self.s.index(); Some(token.span_with(Span::new(start, end))) } } @@ -109,21 +109,21 @@ impl<'s> Iterator for Tokens<'s> { impl<'s> Tokens<'s> { fn read_whitespace(&mut self, first: char) -> Token<'s> { // Shortcut for common case of exactly one space. - if first == ' ' && !self.p.check(|c| c.is_whitespace()) { + if first == ' ' && !self.s.check(|c| c.is_whitespace()) { return Token::Space(0); } // Uneat the first char if it's a newline, so that it's counted in the // loop. if is_newline_char(first) { - self.p.uneat(); + self.s.uneat(); } // Count the number of newlines. let mut newlines = 0; - while let Some(c) = self.p.eat_merging_crlf() { + while let Some(c) = self.s.eat_merging_crlf() { if !c.is_whitespace() { - self.p.uneat(); + self.s.uneat(); break; } @@ -136,17 +136,17 @@ impl<'s> Tokens<'s> { } fn read_line_comment(&mut self) -> Token<'s> { - Token::LineComment(self.p.eat_until(is_newline_char)) + Token::LineComment(self.s.eat_until(is_newline_char)) } fn read_block_comment(&mut self) -> Token<'s> { - let start = self.p.index(); + let start = self.s.index(); let mut state = '_'; let mut depth = 1; // Find the first `*/` that does not correspond to a nested `/*`. - while let Some(c) = self.p.eat() { + while let Some(c) = self.s.eat() { state = match (state, c) { ('*', '/') => { depth -= 1; @@ -164,21 +164,21 @@ impl<'s> Tokens<'s> { } let terminated = depth == 0; - let end = self.p.index() - if terminated { 2 } else { 0 }; + let end = self.s.index() - if terminated { 2 } else { 0 }; - Token::BlockComment(self.p.get(start .. end)) + Token::BlockComment(self.s.get(start .. end)) } fn read_hex(&mut self) -> Token<'s> { // This parses more than the permissable 0-9, a-f, A-F character ranges // to provide nicer error messages later. - Token::Hex(self.p.eat_while(|c| c.is_ascii_alphanumeric())) + Token::Hex(self.s.eat_while(|c| c.is_ascii_alphanumeric())) } fn read_string(&mut self) -> Token<'s> { let mut escaped = false; Token::Str { - string: self.p.eat_until(|c| { + string: self.s.eat_until(|c| { if c == '"' && !escaped { true } else { @@ -186,21 +186,21 @@ impl<'s> Tokens<'s> { false } }), - terminated: self.p.eat_if('"'), + terminated: self.s.eat_if('"'), } } fn read_raw(&mut self) -> Token<'s> { let mut backticks = 1; - while self.p.eat_if('`') { + while self.s.eat_if('`') { backticks += 1; } - let start = self.p.index(); + let start = self.s.index(); let mut found = 0; while found < backticks { - match self.p.eat() { + match self.s.eat() { Some('`') => found += 1, Some(_) => found = 0, None => break, @@ -208,29 +208,29 @@ impl<'s> Tokens<'s> { } let terminated = found == backticks; - let end = self.p.index() - if terminated { found } else { 0 }; + let end = self.s.index() - if terminated { found } else { 0 }; Token::Raw { - raw: self.p.get(start .. end), + raw: self.s.get(start .. end), backticks, terminated, } } fn read_escaped(&mut self) -> Token<'s> { - if let Some(c) = self.p.peek() { + if let Some(c) = self.s.peek() { match c { '[' | ']' | '\\' | '/' | '*' | '_' | '`' | '"' | '#' | '~' => { - let start = self.p.index(); - self.p.eat_assert(c); - Token::Text(&self.p.eaten_from(start)) + let start = self.s.index(); + self.s.eat_assert(c); + Token::Text(&self.s.eaten_from(start)) } - 'u' if self.p.peek_nth(1) == Some('{') => { - self.p.eat_assert('u'); - self.p.eat_assert('{'); + 'u' if self.s.peek_nth(1) == Some('{') => { + self.s.eat_assert('u'); + self.s.eat_assert('{'); Token::UnicodeEscape { - sequence: self.p.eat_while(|c| c.is_ascii_hexdigit()), - terminated: self.p.eat_if('}'), + sequence: self.s.eat_while(|c| c.is_ascii_hexdigit()), + terminated: self.s.eat_if('}'), } } c if c.is_whitespace() => Token::Backslash, @@ -246,7 +246,7 @@ impl<'s> Tokens<'s> { let header = self.mode == Header; let mut last_was_e = false; - self.p.eat_until(|c| { + self.s.eat_until(|c| { let end = match c { c if c.is_whitespace() => true, '[' | ']' | '*' | '/' => true, @@ -259,7 +259,7 @@ impl<'s> Tokens<'s> { end }); - let read = self.p.eaten_from(start); + let read = self.s.eaten_from(start); if self.mode == Header { parse_expr(read) } else { diff --git a/src/syntax/lines.rs b/src/syntax/lines.rs index 7f7ee049..6ea223c4 100644 --- a/src/syntax/lines.rs +++ b/src/syntax/lines.rs @@ -3,7 +3,7 @@ use std::fmt::{self, Debug, Display, Formatter}; use super::Pos; -use crate::parse::{is_newline_char, CharParser}; +use crate::parse::{is_newline_char, Scanner}; /// Enables conversion of byte position to locations. pub struct LineMap<'s> { @@ -15,11 +15,11 @@ impl<'s> LineMap<'s> { /// Create a new line map for a source string. pub fn new(src: &'s str) -> Self { let mut line_starts = vec![Pos::ZERO]; - let mut p = CharParser::new(src); + let mut s = Scanner::new(src); - while let Some(c) = p.eat_merging_crlf() { + while let Some(c) = s.eat_merging_crlf() { if is_newline_char(c) { - line_starts.push(p.index().into()); + line_starts.push(s.index().into()); } } -- cgit v1.2.3