summary | refs | log | tree | commit | diff
path: root/src/parse
diff options
context:
space:
mode:
Diffstat (limited to 'src/parse')
-rw-r--r-- src/parse/mod.rs 2
-rw-r--r-- src/parse/parser.rs 23
-rw-r--r-- src/parse/resolve.rs 20
-rw-r--r-- src/parse/scanner.rs 211
-rw-r--r-- src/parse/tokens.rs 135
5 files changed, 104 insertions, 287 deletions
diff --git a/src/parse/mod.rs b/src/parse/mod.rs
index 47cba111..7536b2ca 100644
--- a/src/parse/mod.rs
+++ b/src/parse/mod.rs
@@ -3,13 +3,11 @@
mod incremental;
mod parser;
mod resolve;
-mod scanner;
mod tokens;
pub use incremental::*;
pub use parser::*;
pub use resolve::*;
-pub use scanner::*;
pub use tokens::*;
use std::collections::HashSet;
diff --git a/src/parse/parser.rs b/src/parse/parser.rs
index 98adfba2..4bbbdc28 100644
--- a/src/parse/parser.rs
+++ b/src/parse/parser.rs
@@ -1,6 +1,6 @@
-use core::slice::SliceIndex;
use std::fmt::{self, Display, Formatter};
use std::mem;
+use std::ops::Range;
use super::{TokenMode, Tokens};
use crate::syntax::{ErrorPos, Green, GreenData, GreenNode, NodeKind};
@@ -116,7 +116,7 @@ impl<'s> Parser<'s> {
_ => false,
};
- self.prev_end = self.tokens.index();
+ self.prev_end = self.tokens.cursor();
self.bump();
if self.tokens.mode() == TokenMode::Code {
@@ -186,15 +186,12 @@ impl<'s> Parser<'s> {
/// Peek at the source of the current token.
pub fn peek_src(&self) -> &'s str {
- self.tokens.scanner().get(self.current_start() .. self.current_end())
+ self.get(self.current_start() .. self.current_end())
}
/// Obtain a range of the source code.
- pub fn get<I>(&self, index: I) -> &'s str
- where
- I: SliceIndex<str, Output = str>,
- {
- self.tokens.scanner().get(index)
+ pub fn get(&self, range: Range<usize>) -> &'s str {
+ self.tokens.scanner().get(range)
}
/// The byte index at which the last non-trivia token ended.
@@ -209,7 +206,7 @@ impl<'s> Parser<'s> {
/// The byte index at which the current token ends.
pub fn current_end(&self) -> usize {
- self.tokens.index()
+ self.tokens.cursor()
}
/// Determine the column index for the given byte index.
@@ -294,8 +291,8 @@ impl<'s> Parser<'s> {
}
self.tokens.jump(target);
- self.prev_end = self.tokens.index();
- self.current_start = self.tokens.index();
+ self.prev_end = self.tokens.cursor();
+ self.current_start = self.tokens.cursor();
self.current = self.tokens.next();
}
@@ -311,9 +308,9 @@ impl<'s> Parser<'s> {
/// handling.
fn bump(&mut self) {
let kind = self.current.take().unwrap();
- let len = self.tokens.index() - self.current_start;
+ let len = self.tokens.cursor() - self.current_start;
self.children.push(GreenData::new(kind, len).into());
- self.current_start = self.tokens.index();
+ self.current_start = self.tokens.cursor();
self.current = self.tokens.next();
}
diff --git a/src/parse/resolve.rs b/src/parse/resolve.rs
index 0d4cf071..dd9ed4f4 100644
--- a/src/parse/resolve.rs
+++ b/src/parse/resolve.rs
@@ -1,4 +1,6 @@
-use super::{is_ident, is_newline, Scanner};
+use unscanny::Scanner;
+
+use super::{is_ident, is_newline};
use crate::syntax::ast::RawNode;
use crate::util::EcoString;
@@ -13,7 +15,7 @@ pub fn resolve_string(string: &str) -> EcoString {
continue;
}
- let start = s.last_index();
+ let start = s.locate(-1);
match s.eat() {
Some('\\') => out.push('\\'),
Some('"') => out.push('"'),
@@ -22,17 +24,17 @@ pub fn resolve_string(string: &str) -> EcoString {
Some('t') => out.push('\t'),
Some('u') if s.eat_if('{') => {
// TODO: Feedback if closing brace is missing.
- let sequence = s.eat_while(|c| c.is_ascii_hexdigit());
+ let sequence = s.eat_while(char::is_ascii_hexdigit);
let _terminated = s.eat_if('}');
match resolve_hex(sequence) {
Some(c) => out.push(c),
- None => out.push_str(s.eaten_from(start)),
+ None => out.push_str(s.from(start)),
}
}
// TODO: Feedback about invalid escape sequence.
- _ => out.push_str(s.eaten_from(start)),
+ _ => out.push_str(s.from(start)),
}
}
@@ -68,8 +70,8 @@ pub fn resolve_raw(column: usize, backticks: usize, text: &str) -> RawNode {
fn split_at_lang_tag(raw: &str) -> (&str, &str) {
let mut s = Scanner::new(raw);
(
- s.eat_until(|c| c == '`' || c.is_whitespace() || is_newline(c)),
- s.rest(),
+ s.eat_until(|c: char| c == '`' || c.is_whitespace() || is_newline(c)),
+ s.after(),
)
}
@@ -129,9 +131,9 @@ fn split_lines(text: &str) -> Vec<&str> {
}
lines.push(&text[start .. end]);
- start = s.index();
+ start = s.cursor();
}
- end = s.index();
+ end = s.cursor();
}
lines.push(&text[start ..]);
diff --git a/src/parse/scanner.rs b/src/parse/scanner.rs
deleted file mode 100644
index e4cf56e9..00000000
--- a/src/parse/scanner.rs
+++ /dev/null
@@ -1,211 +0,0 @@
-use std::slice::SliceIndex;
-
-use unicode_xid::UnicodeXID;
-
-/// A featureful char-based scanner.
-#[derive(Copy, Clone)]
-pub struct Scanner<'s> {
- /// The string to scan.
- src: &'s str,
- /// The index at which the peekable character starts. Must be in bounds and
- /// at a codepoint boundary to guarantee safety.
- index: usize,
-}
-
-impl<'s> Scanner<'s> {
- /// Create a new char scanner.
- #[inline]
- pub fn new(src: &'s str) -> Self {
- Self { src, index: 0 }
- }
-
- /// Whether the end of the string is reached.
- pub fn eof(&self) -> bool {
- self.index == self.src.len()
- }
-
- /// Consume the next char.
- #[inline]
- pub fn eat(&mut self) -> Option<char> {
- let next = self.peek();
- if let Some(c) = next {
- self.index += c.len_utf8();
- }
- next
- }
-
- /// Consume the next char if it is the given one.
- ///
- /// Returns whether the char was consumed.
- #[inline]
- pub fn eat_if(&mut self, c: char) -> bool {
- let matches = self.peek() == Some(c);
- if matches {
- self.index += c.len_utf8();
- }
- matches
- }
-
- /// Consume the next char, debug-asserting that it is the given one.
- #[inline]
- pub fn eat_assert(&mut self, c: char) {
- let next = self.eat();
- debug_assert_eq!(next, Some(c));
- }
-
- /// Eat chars while the condition is true.
- #[inline]
- pub fn eat_while<F>(&mut self, mut f: F) -> &'s str
- where
- F: FnMut(char) -> bool,
- {
- self.eat_until(|c| !f(c))
- }
-
- /// Eat chars until the condition is true.
- #[inline]
- pub fn eat_until<F>(&mut self, mut f: F) -> &'s str
- where
- F: FnMut(char) -> bool,
- {
- let start = self.index;
- while let Some(c) = self.peek() {
- if f(c) {
- break;
- }
- self.index += c.len_utf8();
- }
- self.eaten_from(start)
- }
-
- /// Uneat the last eaten char.
- #[inline]
- pub fn uneat(&mut self) {
- self.index = self.last_index();
- }
-
- /// Peek at the next char without consuming it.
- #[inline]
- pub fn peek(&self) -> Option<char> {
- self.rest().chars().next()
- }
-
- /// Get the nth-previous eaten char.
- #[inline]
- pub fn prev(&self, n: usize) -> Option<char> {
- self.eaten().chars().nth_back(n)
- }
-
- /// Checks whether the next char fulfills a condition.
- ///
- /// Returns `default` if there is no next char.
- #[inline]
- pub fn check_or<F>(&self, default: bool, f: F) -> bool
- where
- F: FnOnce(char) -> bool,
- {
- self.peek().map_or(default, f)
- }
-
- /// The previous index in the source string.
- #[inline]
- pub fn last_index(&self) -> usize {
- self.eaten().chars().last().map_or(0, |c| self.index - c.len_utf8())
- }
-
- /// The current index in the source string.
- #[inline]
- pub fn index(&self) -> usize {
- self.index
- }
-
- /// Jump to an index in the source string.
- #[inline]
- pub fn jump(&mut self, index: usize) {
- // Make sure that the index is in bounds and on a codepoint boundary.
- self.src.get(index ..).expect("jumped to invalid index");
- self.index = index;
- }
-
- /// The full source string.
- #[inline]
- pub fn src(&self) -> &'s str {
- self.src
- }
-
- /// Slice out part of the source string.
- #[inline]
- pub fn get<I>(&self, index: I) -> &'s str
- where
- I: SliceIndex<str, Output = str>,
- {
- // See `eaten_from` for details about `unwrap_or_default`.
- self.src.get(index).unwrap_or_default()
- }
-
- /// The remaining source string after the current index.
- #[inline]
- pub fn rest(&self) -> &'s str {
- // Safety: The index is always in bounds and on a codepoint boundary
- // since it starts at zero and is is:
- // - either increased by the length of a scanned character, advacing
- // from one codepoint boundary to the next,
- // - or checked upon jumping.
- unsafe { self.src.get_unchecked(self.index ..) }
- }
-
- /// The full source string up to the current index.
- #[inline]
- pub fn eaten(&self) -> &'s str {
- // Safety: The index is always okay, for details see `rest()`.
- unsafe { self.src.get_unchecked(.. self.index) }
- }
-
- /// The source string from `start` to the current index.
- #[inline]
- pub fn eaten_from(&self, start: usize) -> &'s str {
- // Using `unwrap_or_default` is much faster than unwrap, probably
- // because then the whole call to `eaten_from` is pure and can be
- // optimized away in some cases.
- self.src.get(start .. self.index).unwrap_or_default()
- }
-}
-
-/// Whether this character denotes a newline.
-#[inline]
-pub fn is_newline(character: char) -> bool {
- matches!(
- character,
- // Line Feed, Vertical Tab, Form Feed, Carriage Return.
- '\n' | '\x0B' | '\x0C' | '\r' |
- // Next Line, Line Separator, Paragraph Separator.
- '\u{0085}' | '\u{2028}' | '\u{2029}'
- )
-}
-
-/// Whether a string is a valid unicode identifier.
-///
-/// In addition to what is specified in the [Unicode Standard][uax31], we allow:
-/// - `_` as a starting character,
-/// - `_` and `-` as continuing characters.
-///
-/// [uax31]: http://www.unicode.org/reports/tr31/
-#[inline]
-pub fn is_ident(string: &str) -> bool {
- let mut chars = string.chars();
- chars
- .next()
- .map_or(false, |c| is_id_start(c) && chars.all(is_id_continue))
-}
-
-/// Whether a character can start an identifier.
-#[inline]
-pub fn is_id_start(c: char) -> bool {
- c.is_xid_start() || c == '_'
-}
-
-/// Whether a character can continue an identifier.
-#[inline]
-pub fn is_id_continue(c: char) -> bool {
- c.is_xid_continue() || c == '_' || c == '-'
-}
diff --git a/src/parse/tokens.rs b/src/parse/tokens.rs
index 053a7f61..ae3d7b9c 100644
--- a/src/parse/tokens.rs
+++ b/src/parse/tokens.rs
@@ -1,9 +1,9 @@
use std::sync::Arc;
-use super::{
- is_id_continue, is_id_start, is_newline, resolve_hex, resolve_raw, resolve_string,
- Scanner,
-};
+use unicode_xid::UnicodeXID;
+use unscanny::Scanner;
+
+use super::{resolve_hex, resolve_raw, resolve_string};
use crate::geom::{AngleUnit, LengthUnit};
use crate::syntax::ast::{MathNode, RawNode, Unit};
use crate::syntax::{ErrorPos, NodeKind};
@@ -65,13 +65,11 @@ impl<'s> Tokens<'s> {
/// The index in the string at which the last token ends and next token
/// will start.
#[inline]
- pub fn index(&self) -> usize {
- self.s.index()
+ pub fn cursor(&self) -> usize {
+ self.s.cursor()
}
/// Jump to the given index in the string.
- ///
- /// You need to know the correct column.
#[inline]
pub fn jump(&mut self, index: usize) {
self.s.jump(index);
@@ -92,7 +90,7 @@ impl<'s> Tokens<'s> {
/// The column index of a given index in the source string.
#[inline]
pub fn column(&self, index: usize) -> usize {
- column(self.s.src(), index, self.column_offset)
+ column(self.s.string(), index, self.column_offset)
}
}
@@ -102,7 +100,7 @@ impl<'s> Iterator for Tokens<'s> {
/// Parse the next token in the source code.
#[inline]
fn next(&mut self) -> Option<Self::Item> {
- let start = self.s.index();
+ let start = self.s.cursor();
let c = self.s.eat()?;
Some(match c {
// Blocks.
@@ -112,15 +110,13 @@ impl<'s> Iterator for Tokens<'s> {
']' => NodeKind::RightBracket,
// Whitespace.
- ' ' if self.s.check_or(true, |c| !c.is_whitespace()) => NodeKind::Space(0),
+ ' ' if self.s.done() || !self.s.at(char::is_whitespace) => NodeKind::Space(0),
c if c.is_whitespace() => self.whitespace(),
// Comments with special case for URLs.
'/' if self.s.eat_if('*') => self.block_comment(),
'/' if !self.maybe_in_url() && self.s.eat_if('/') => self.line_comment(),
- '*' if self.s.eat_if('/') => {
- NodeKind::Unknown(self.s.eaten_from(start).into())
- }
+ '*' if self.s.eat_if('/') => NodeKind::Unknown(self.s.from(start).into()),
// Other things.
_ => match self.mode {
@@ -187,22 +183,20 @@ impl<'s> Tokens<'s> {
'=' => NodeKind::Eq,
'<' => NodeKind::Lt,
'>' => NodeKind::Gt,
- '.' if self.s.check_or(true, |n| !n.is_ascii_digit()) => NodeKind::Dot,
+ '.' if self.s.done() || !self.s.at(char::is_ascii_digit) => NodeKind::Dot,
// Identifiers.
c if is_id_start(c) => self.ident(start),
// Numbers.
- c if c.is_ascii_digit()
- || (c == '.' && self.s.check_or(false, |n| n.is_ascii_digit())) =>
- {
+ c if c.is_ascii_digit() || (c == '.' && self.s.at(char::is_ascii_digit)) => {
self.number(start, c)
}
// Strings.
'"' => self.string(),
- _ => NodeKind::Unknown(self.s.eaten_from(start).into()),
+ _ => NodeKind::Unknown(self.s.from(start).into()),
}
}
@@ -226,19 +220,19 @@ impl<'s> Tokens<'s> {
};
loop {
- self.s.eat_until(|c| {
+ self.s.eat_until(|c: char| {
TABLE.get(c as usize).copied().unwrap_or_else(|| c.is_whitespace())
});
let mut s = self.s;
- if !(s.eat_if(' ') && s.check_or(false, char::is_alphanumeric)) {
+ if !(s.eat_if(' ') && s.at(char::is_alphanumeric)) {
break;
}
self.s.eat();
}
- NodeKind::Text(self.s.eaten_from(start).into())
+ NodeKind::Text(self.s.from(start).into())
}
fn whitespace(&mut self) -> NodeKind {
@@ -276,13 +270,11 @@ impl<'s> Tokens<'s> {
'[' | ']' | '{' | '}' | '#' |
// Markup.
'~' | '\'' | '"' | '*' | '_' | '`' | '$' | '=' | '-' | '.' => {
- self.s.eat_assert(c) ;
+ self.s.expect(c);
NodeKind::Escape(c)
}
- 'u' if self.s.rest().starts_with("u{") => {
- self.s.eat_assert('u');
- self.s.eat_assert('{');
- let sequence = self.s.eat_while(|c| c.is_ascii_alphanumeric());
+ 'u' if self.s.eat_if("u{") => {
+ let sequence = self.s.eat_while(char::is_ascii_alphanumeric);
if self.s.eat_if('}') {
if let Some(c) = resolve_hex(sequence) {
NodeKind::Escape(c)
@@ -304,7 +296,7 @@ impl<'s> Tokens<'s> {
// Linebreaks.
c if c.is_whitespace() => NodeKind::Linebreak(false),
'+' => {
- self.s.eat_assert(c);
+ self.s.expect(c);
NodeKind::Linebreak(true)
}
@@ -315,7 +307,7 @@ impl<'s> Tokens<'s> {
#[inline]
fn hash(&mut self) -> NodeKind {
- if self.s.check_or(false, is_id_start) {
+ if self.s.at(is_id_start) {
let read = self.s.eat_while(is_id_continue);
match keyword(read) {
Some(keyword) => keyword,
@@ -342,10 +334,10 @@ impl<'s> Tokens<'s> {
fn numbering(&mut self, start: usize, c: char) -> NodeKind {
let number = if c != '.' {
- self.s.eat_while(|c| c.is_ascii_digit());
- let read = self.s.eaten_from(start);
+ self.s.eat_while(char::is_ascii_digit);
+ let read = self.s.from(start);
if !self.s.eat_if('.') {
- return NodeKind::Text(self.s.eaten_from(start).into());
+ return NodeKind::Text(self.s.from(start).into());
}
read.parse().ok()
} else {
@@ -356,7 +348,7 @@ impl<'s> Tokens<'s> {
}
fn raw(&mut self) -> NodeKind {
- let column = self.column(self.s.index() - 1);
+ let column = self.column(self.s.cursor() - 1);
let mut backticks = 1;
while self.s.eat_if('`') {
@@ -372,7 +364,7 @@ impl<'s> Tokens<'s> {
}));
}
- let start = self.s.index();
+ let start = self.s.cursor();
let mut found = 0;
while found < backticks {
@@ -384,7 +376,7 @@ impl<'s> Tokens<'s> {
}
if found == backticks {
- let end = self.s.index() - found as usize;
+ let end = self.s.cursor() - found as usize;
NodeKind::Raw(Arc::new(resolve_raw(
column,
backticks,
@@ -412,7 +404,7 @@ impl<'s> Tokens<'s> {
display = true;
}
- let start = self.s.index();
+ let start = self.s.cursor();
let mut escaped = false;
let mut dollar = !display;
@@ -429,7 +421,7 @@ impl<'s> Tokens<'s> {
}
};
- let end = self.s.index()
+ let end = self.s.cursor()
- match (terminated, display) {
(false, _) => 0,
(true, false) => 1,
@@ -456,7 +448,7 @@ impl<'s> Tokens<'s> {
fn ident(&mut self, start: usize) -> NodeKind {
self.s.eat_while(is_id_continue);
- match self.s.eaten_from(start) {
+ match self.s.from(start) {
"none" => NodeKind::None,
"auto" => NodeKind::Auto,
"true" => NodeKind::Bool(true),
@@ -467,30 +459,29 @@ impl<'s> Tokens<'s> {
fn number(&mut self, start: usize, c: char) -> NodeKind {
// Read the first part (integer or fractional depending on `first`).
- self.s.eat_while(|c| c.is_ascii_digit());
+ self.s.eat_while(char::is_ascii_digit);
// Read the fractional part if not already done.
// Make sure not to confuse a range for the decimal separator.
- if c != '.' && !self.s.rest().starts_with("..") && self.s.eat_if('.') {
- self.s.eat_while(|c| c.is_ascii_digit());
+ if c != '.' && !self.s.at("..") && self.s.eat_if('.') {
+ self.s.eat_while(char::is_ascii_digit);
}
// Read the exponent.
- let em = self.s.rest().starts_with("em");
- if !em && self.s.eat_if('e') || self.s.eat_if('E') {
- let _ = self.s.eat_if('+') || self.s.eat_if('-');
- self.s.eat_while(|c| c.is_ascii_digit());
+ if !self.s.at("em") && self.s.eat_if(['e', 'E']) {
+ self.s.eat_if(['+', '-']);
+ self.s.eat_while(char::is_ascii_digit);
}
// Read the suffix.
- let suffix_start = self.s.index();
+ let suffix_start = self.s.cursor();
if !self.s.eat_if('%') {
- self.s.eat_while(|c| c.is_ascii_alphanumeric());
+ self.s.eat_while(char::is_ascii_alphanumeric);
}
let number = self.s.get(start .. suffix_start);
- let suffix = self.s.eaten_from(suffix_start);
- let all = self.s.eaten_from(start);
+ let suffix = self.s.from(suffix_start);
+ let all = self.s.from(start);
// Find out whether it is a simple number.
if suffix.is_empty() {
@@ -575,13 +566,13 @@ impl<'s> Tokens<'s> {
fn in_word(&self) -> bool {
let alphanumeric = |c: Option<char>| c.map_or(false, |c| c.is_alphanumeric());
- let prev = self.s.prev(1);
+ let prev = self.s.scout(-2);
let next = self.s.peek();
alphanumeric(prev) && alphanumeric(next)
}
fn maybe_in_url(&self) -> bool {
- self.mode == TokenMode::Markup && self.s.eaten().ends_with(":/")
+ self.mode == TokenMode::Markup && self.s.before().ends_with(":/")
}
}
@@ -610,7 +601,8 @@ fn keyword(ident: &str) -> Option<NodeKind> {
})
}
-/// The column index of a given index in the source string, given a column offset for the first line.
+/// The column index of a given index in the source string, given a column
+/// offset for the first line.
#[inline]
fn column(string: &str, index: usize, offset: usize) -> usize {
let mut apply_offset = false;
@@ -634,6 +626,45 @@ fn column(string: &str, index: usize, offset: usize) -> usize {
if apply_offset { res + offset } else { res }
}
+/// Whether this character denotes a newline.
+#[inline]
+pub fn is_newline(character: char) -> bool {
+ matches!(
+ character,
+ // Line Feed, Vertical Tab, Form Feed, Carriage Return.
+ '\n' | '\x0B' | '\x0C' | '\r' |
+ // Next Line, Line Separator, Paragraph Separator.
+ '\u{0085}' | '\u{2028}' | '\u{2029}'
+ )
+}
+
+/// Whether a string is a valid unicode identifier.
+///
+/// In addition to what is specified in the [Unicode Standard][uax31], we allow:
+/// - `_` as a starting character,
+/// - `_` and `-` as continuing characters.
+///
+/// [uax31]: http://www.unicode.org/reports/tr31/
+#[inline]
+pub fn is_ident(string: &str) -> bool {
+ let mut chars = string.chars();
+ chars
+ .next()
+ .map_or(false, |c| is_id_start(c) && chars.all(is_id_continue))
+}
+
+/// Whether a character can start an identifier.
+#[inline]
+pub fn is_id_start(c: char) -> bool {
+ c.is_xid_start() || c == '_'
+}
+
+/// Whether a character can continue an identifier.
+#[inline]
+pub fn is_id_continue(c: char) -> bool {
+ c.is_xid_continue() || c == '_' || c == '-'
+}
+
#[cfg(test)]
#[allow(non_snake_case)]
mod tests {