summaryrefslogtreecommitdiff
path: root/src/syntax/lexer.rs
diff options
context:
space:
mode:
authorLaurenz <laurmaedje@gmail.com>2023-01-14 12:34:17 +0100
committerLaurenz <laurmaedje@gmail.com>2023-01-14 12:34:17 +0100
commitab03f3224078f1b8ca05bc1b65a7df4bebb5f449 (patch)
tree968c8fff18863187d77fe0ce26a726aeddbca195 /src/syntax/lexer.rs
parent2589692b00d40fd5094ea2d5b9448e2cfc052045 (diff)
Reorganize syntax module
Diffstat (limited to 'src/syntax/lexer.rs')
-rw-r--r--src/syntax/lexer.rs769
1 files changed, 769 insertions, 0 deletions
diff --git a/src/syntax/lexer.rs b/src/syntax/lexer.rs
new file mode 100644
index 00000000..d5476774
--- /dev/null
+++ b/src/syntax/lexer.rs
@@ -0,0 +1,769 @@
+use std::num::NonZeroUsize;
+use std::sync::Arc;
+
+use unicode_xid::UnicodeXID;
+use unscanny::Scanner;
+
+use super::resolve::{resolve_hex, resolve_raw, resolve_string};
+use super::{ErrorPos, RawFields, SyntaxKind, Unit};
+use crate::geom::{AbsUnit, AngleUnit};
+use crate::util::{format_eco, EcoString};
+
+/// Splits up a string of source code into tokens.
+///
+/// The lexer is an iterator over `SyntaxKind`s. Which tokens it recognizes
+/// depends on the current `LexMode`, which can be switched between tokens.
+#[derive(Clone)]
+pub struct Lexer<'s> {
+ /// The underlying scanner.
+ s: Scanner<'s>,
+ /// The mode the lexer is in. This determines what tokens it recognizes.
+ mode: LexMode,
+ /// Whether the last token has been terminated. Starts out as `true` and is
+ /// set to `false` when a token runs into the end of the input without its
+ /// closing delimiter (or, for line comments, without a newline).
+ terminated: bool,
+ /// Offsets the indentation on the first line of the source. It is added to
+ /// column indices computed for positions on the first line.
+ column_offset: usize,
+}
+
+/// What kind of tokens to emit.
+///
+/// Comments and whitespace are lexed the same way in every mode; everything
+/// else is dispatched to the mode-specific lexing routine.
+#[derive(Debug, Copy, Clone, Eq, PartialEq)]
+pub enum LexMode {
+ /// Text and markup.
+ Markup,
+ /// Math atoms, operators, etc.
+ Math,
+ /// Keywords, literals and operators.
+ Code,
+}
+
+impl<'s> Lexer<'s> {
+    /// Build a lexer over `text` that starts out in the given mode.
+    pub fn new(text: &'s str, mode: LexMode) -> Self {
+        Self::with_prefix("", text, mode)
+    }
+
+    /// Build a lexer over `text` that starts out in the given mode, treating
+    /// `prefix` as text that precedes the first line. Only the column of the
+    /// prefix's end is kept; it shifts column calculations on the first line.
+    pub fn with_prefix(prefix: &str, text: &'s str, mode: LexMode) -> Self {
+        let s = Scanner::new(text);
+        let column_offset = column(prefix, prefix.len(), 0);
+        Self { s, mode, terminated: true, column_offset }
+    }
+
+    /// The mode the lexer is currently in.
+    pub fn mode(&self) -> LexMode {
+        self.mode
+    }
+
+    /// Switch to another lexing mode for the tokens that follow.
+    pub fn set_mode(&mut self, mode: LexMode) {
+        self.mode = mode;
+    }
+
+    /// The index in the string at which the last token ends and the next
+    /// token will start.
+    pub fn cursor(&self) -> usize {
+        self.s.cursor()
+    }
+
+    /// Move the scanner to the given index in the string.
+    pub fn jump(&mut self, index: usize) {
+        self.s.jump(index);
+    }
+
+    /// A copy of the underlying scanner in its current state.
+    pub fn scanner(&self) -> Scanner<'s> {
+        self.s
+    }
+
+    /// Whether the last token was terminated.
+    pub fn terminated(&self) -> bool {
+        self.terminated
+    }
+
+    /// The column index of a given index in the source string, taking the
+    /// first-line column offset into account.
+    pub fn column(&self, index: usize) -> usize {
+        let source = self.s.string();
+        column(source, index, self.column_offset)
+    }
+}
+
+impl Iterator for Lexer<'_> {
+    type Item = SyntaxKind;
+
+    /// Lex the next token, or return `None` once the input is exhausted.
+    fn next(&mut self) -> Option<Self::Item> {
+        let start = self.s.cursor();
+        let c = self.s.eat()?;
+
+        // Comments and whitespace are mode-independent and take precedence
+        // over everything else. The checks run in order; a failed `eat_if`
+        // consumes nothing, so later branches see the same scanner state.
+        let kind = if c == '/' && self.s.eat_if('/') {
+            self.line_comment()
+        } else if c == '/' && self.s.eat_if('*') {
+            self.block_comment()
+        } else if c == '*' && self.s.eat_if('/') {
+            SyntaxKind::Error(ErrorPos::Full, "unexpected end of block comment".into())
+        } else if c.is_whitespace() {
+            self.whitespace(c)
+        } else {
+            // Everything else depends on the current mode.
+            match self.mode {
+                LexMode::Markup => self.markup(start, c),
+                LexMode::Math => self.math(start, c),
+                LexMode::Code => self.code(start, c),
+            }
+        };
+
+        Some(kind)
+    }
+}
+
+/// Shared: comments and whitespace, which are lexed the same in every mode.
+impl Lexer<'_> {
+    /// Consume the rest of a line comment whose `//` has already been eaten.
+    /// The terminating newline itself is left in the input.
+    fn line_comment(&mut self) -> SyntaxKind {
+        self.s.eat_until(is_newline);
+
+        // Running into the end of the input instead of a newline marks the
+        // token as unterminated.
+        let hit_end = self.s.peek().is_none();
+        if hit_end {
+            self.terminated = false;
+        }
+
+        SyntaxKind::LineComment
+    }
+
+    /// Consume a block comment whose opening `/*` has already been eaten,
+    /// honoring nested block comments and line comments inside of it.
+    fn block_comment(&mut self) -> SyntaxKind {
+        // The previously eaten character, or `None` right after a two-char
+        // sequence was recognized (so its second half is not reused).
+        let mut prev: Option<char> = None;
+        let mut depth = 1;
+
+        // Unterminated until the matching `*/` shows up.
+        self.terminated = false;
+
+        while let Some(c) = self.s.eat() {
+            match (prev, c) {
+                // `*/` closes one nesting level.
+                (Some('*'), '/') => {
+                    depth -= 1;
+                    if depth == 0 {
+                        self.terminated = true;
+                        break;
+                    }
+                    prev = None;
+                }
+                // `/*` opens a nested block comment.
+                (Some('/'), '*') => {
+                    depth += 1;
+                    prev = None;
+                }
+                // `//` swallows the rest of the line.
+                (Some('/'), '/') => {
+                    self.line_comment();
+                    prev = None;
+                }
+                _ => prev = Some(c),
+            }
+        }
+
+        SyntaxKind::BlockComment
+    }
+
+    /// Consume a run of whitespace starting with the already eaten `c` and
+    /// report how many newlines it contains.
+    fn whitespace(&mut self, c: char) -> SyntaxKind {
+        // Fast path: a single space that is not followed by more whitespace.
+        let lone_space = c == ' ' && !self.s.at(char::is_whitespace);
+        if lone_space {
+            return SyntaxKind::Space { newlines: 0 };
+        }
+
+        // Rewind so that the loop below also sees the first character.
+        self.s.uneat();
+
+        let mut newlines = 0;
+        loop {
+            match self.s.eat() {
+                Some(ws) if ws.is_whitespace() => {
+                    // Treat `\r\n` as a single newline.
+                    if ws == '\r' {
+                        self.s.eat_if('\n');
+                    }
+                    if is_newline(ws) {
+                        newlines += 1;
+                    }
+                }
+                // Hand the first non-whitespace character back.
+                Some(_) => {
+                    self.s.uneat();
+                    break;
+                }
+                None => break,
+            }
+        }
+
+        SyntaxKind::Space { newlines }
+    }
+}
+
+impl Lexer<'_> {
+ /// Lex one token in markup mode.
+ ///
+ /// `c` is the character that was just eaten and `start` is the scanner's
+ /// cursor position from before it was eaten. Arms are tried top to bottom,
+ /// so a guarded arm that fails (e.g. `.` not followed by `..`) falls
+ /// through to the later arms and, ultimately, to plain text.
+ fn markup(&mut self, start: usize, c: char) -> SyntaxKind {
+ match c {
+ // Blocks.
+ '{' => SyntaxKind::LeftBrace,
+ '}' => SyntaxKind::RightBrace,
+ '[' => SyntaxKind::LeftBracket,
+ ']' => SyntaxKind::RightBracket,
+
+ // Multi-char things.
+ '#' => self.hash(start),
+ // `...` becomes a horizontal ellipsis.
+ '.' if self.s.eat_if("..") => SyntaxKind::Shorthand('\u{2026}'),
+ '-' => self.hyph(),
+ ':' => self.colon(),
+ // The leading `h` of `http://` / `https://` is `c` itself.
+ 'h' if self.s.eat_if("ttp://") || self.s.eat_if("ttps://") => {
+ self.link(start)
+ }
+ '`' => self.raw(),
+ c if c.is_ascii_digit() => self.numbering(start),
+ '<' if self.s.at(is_id_continue) => self.label(),
+ '@' if self.s.at(is_id_continue) => self.reference(),
+
+ // Escape sequences.
+ '\\' => self.backslash(),
+
+ // Single-char things.
+ // `~` becomes a no-break space.
+ '~' => SyntaxKind::Shorthand('\u{00A0}'),
+ '\'' => SyntaxKind::SmartQuote { double: false },
+ '"' => SyntaxKind::SmartQuote { double: true },
+ // `*` and `_` between two alphanumeric characters stay plain text.
+ '*' if !self.in_word() => SyntaxKind::Star,
+ '_' if !self.in_word() => SyntaxKind::Underscore,
+ '$' => SyntaxKind::Dollar,
+ '=' => SyntaxKind::Eq,
+ '+' => SyntaxKind::Plus,
+ '/' => SyntaxKind::Slash,
+
+ // Plain text.
+ _ => self.text(start),
+ }
+ }
+
+ /// Lex a run of plain text that begins at `start`.
+ ///
+ /// The text extends up to the next character that could begin another
+ /// markup token. When such a character turns out not to begin one (judged
+ /// by a lookahead at what follows it), it is absorbed into the text and
+ /// scanning continues.
+ fn text(&mut self, start: usize) -> SyntaxKind {
+ // Builds a lookup table over the ASCII range in which exactly the
+ // listed characters are marked `true`.
+ macro_rules! table {
+ ($(|$c:literal)*) => {{
+ let mut t = [false; 128];
+ $(t[$c as usize] = true;)*
+ t
+ }}
+ }
+
+ // ASCII characters at which a text run is (tentatively) interrupted.
+ const TABLE: [bool; 128] = table! {
+ | ' ' | '\t' | '\n' | '\x0b' | '\x0c' | '\r' | '\\' | '/'
+ | '[' | ']' | '{' | '}' | '~' | '-' | '.' | '\'' | '"'
+ | '*' | '_' | ':' | 'h' | '`' | '$' | '<' | '>' | '@' | '#'
+ };
+
+ loop {
+ // Non-ASCII characters are not covered by the table; for those, only
+ // whitespace interrupts the text.
+ self.s.eat_until(|c: char| {
+ TABLE.get(c as usize).copied().unwrap_or_else(|| c.is_whitespace())
+ });
+
+ // Continue with the same text node if the thing would become text
+ // anyway.
+ // The lookahead works on a copy of the scanner, which is only
+ // committed (below) when the text continues.
+ // NOTE(review): `@` is checked against `is_id_start` here, whereas
+ // `markup` starts a reference when `@` is followed by
+ // `is_id_continue` -- confirm that the difference is intended.
+ let mut s = self.s;
+ match s.eat() {
+ Some('/') if !s.at(['/', '*']) => {}
+ Some(' ') if s.at(char::is_alphanumeric) => {}
+ Some('-') if !s.at(['-', '?']) => {}
+ Some('.') if !s.at("..") => {}
+ Some('h') if !s.at("ttp://") && !s.at("ttps://") => {}
+ Some('@' | '#') if !s.at(is_id_start) => {}
+ _ => break,
+ }
+
+ self.s = s;
+ }
+
+ SyntaxKind::Text(self.s.from(start).into())
+ }
+
+    /// Lex what follows an already eaten backslash: a `\u{...}` unicode
+    /// escape, a forced linebreak, or a single escaped character.
+    fn backslash(&mut self) -> SyntaxKind {
+        // Unicode escape sequence: `\u{1F600}`.
+        if self.s.eat_if("u{") {
+            let hex = self.s.eat_while(char::is_ascii_alphanumeric);
+
+            if !self.s.eat_if('}') {
+                self.terminated = false;
+                return SyntaxKind::Error(ErrorPos::End, "expected closing brace".into());
+            }
+
+            return match resolve_hex(hex) {
+                Some(c) => SyntaxKind::Escape(c),
+                None => SyntaxKind::Error(
+                    ErrorPos::Full,
+                    "invalid unicode escape sequence".into(),
+                ),
+            };
+        }
+
+        match self.s.peek() {
+            // Any other non-whitespace character is escaped as-is.
+            Some(c) if !c.is_whitespace() => {
+                self.s.expect(c);
+                SyntaxKind::Escape(c)
+            }
+            // A backslash in front of whitespace or at the very end of the
+            // input is a linebreak. The whitespace is not consumed here.
+            _ => SyntaxKind::Linebreak,
+        }
+    }
+
+    /// Lex what follows an already eaten `#`: an opening brace or bracket, a
+    /// keyword or identifier, or -- when nothing of that sort follows --
+    /// plain text (in markup) or a lone `#` atom (otherwise).
+    fn hash(&mut self, start: usize) -> SyntaxKind {
+        if self.s.eat_if('{') {
+            return SyntaxKind::LeftBrace;
+        }
+
+        if self.s.eat_if('[') {
+            return SyntaxKind::LeftBracket;
+        }
+
+        if self.s.at(is_id_start) {
+            let name = self.s.eat_while(is_id_continue);
+            return keyword(name).unwrap_or_else(|| SyntaxKind::Ident(name.into()));
+        }
+
+        match self.mode {
+            LexMode::Markup => self.text(start),
+            LexMode::Math | LexMode::Code => SyntaxKind::Atom("#".into()),
+        }
+    }
+
+    /// Lex what follows an already eaten `-`: the shorthands `---`, `--` and
+    /// `-?`, or else a plain minus.
+    fn hyph(&mut self) -> SyntaxKind {
+        if self.s.eat_if("--") {
+            // `---`: em dash.
+            SyntaxKind::Shorthand('\u{2014}')
+        } else if self.s.eat_if('-') {
+            // `--`: en dash.
+            SyntaxKind::Shorthand('\u{2013}')
+        } else if self.s.eat_if('?') {
+            // `-?`: soft hyphen.
+            SyntaxKind::Shorthand('\u{00AD}')
+        } else {
+            SyntaxKind::Minus
+        }
+    }
+
+    /// Lex what follows an already eaten `:`: either symbol notation such as
+    /// `:name:` or `:name:modifier:`, or just the colon itself.
+    fn colon(&mut self) -> SyntaxKind {
+        let start = self.s.cursor();
+
+        // `end` trails behind the scanner: it only advances over segments
+        // that are confirmed by a following colon.
+        let mut end = start;
+        loop {
+            let segment = self.s.eat_while(char::is_ascii_alphanumeric);
+            if segment.is_empty() || !self.s.at(':') {
+                break;
+            }
+            end = self.s.cursor();
+            self.s.eat();
+        }
+
+        // Drop whatever was eaten beyond the last confirmed segment.
+        self.s.jump(end);
+
+        if end == start {
+            // No symbol name: this is a lone colon.
+            return match self.mode {
+                LexMode::Markup => SyntaxKind::Colon,
+                LexMode::Math | LexMode::Code => SyntaxKind::Atom(":".into()),
+            };
+        }
+
+        // Consume the closing colon, which is not part of the name.
+        self.s.expect(':');
+        SyntaxKind::Symbol(self.s.get(start..end).into())
+    }
+
+    /// Lex the rest of a link whose `http://` or `https://` scheme has
+    /// already been eaten. The token text spans from `start`.
+    fn link(&mut self, start: usize) -> SyntaxKind {
+        /// Whether the character may appear in the body of a link.
+        fn is_link_char(c: char) -> bool {
+            c.is_ascii_alphanumeric()
+                || matches!(
+                    c,
+                    '~' | '/' | '%' | '?' | '#' | '&' | '+' | '=' | '\'' | '.' | ',' | ';'
+                )
+        }
+
+        self.s.eat_while(is_link_char);
+
+        // A trailing dot is left out of the link.
+        let ends_with_dot = self.s.scout(-1) == Some('.');
+        if ends_with_dot {
+            self.s.uneat();
+        }
+
+        SyntaxKind::Link(self.s.from(start).into())
+    }
+
+    /// Lex raw text whose first backtick has already been eaten. The raw
+    /// text is closed by as many backticks as opened it.
+    fn raw(&mut self) -> SyntaxKind {
+        // Column of the first backtick.
+        let column = self.column(self.s.cursor() - 1);
+
+        // Count the opening backticks, including the already eaten one.
+        let mut backticks = 1;
+        while self.s.eat_if('`') {
+            backticks += 1;
+        }
+
+        // Exactly two backticks are an empty inline raw element rather than
+        // an opening delimiter.
+        if backticks == 2 {
+            let empty = RawFields { text: EcoString::new(), lang: None, block: false };
+            return SyntaxKind::Raw(Arc::new(empty));
+        }
+
+        // Look for a run of backticks as long as the opening one.
+        let start = self.s.cursor();
+        let mut found = 0;
+        while found < backticks {
+            let Some(c) = self.s.eat() else { break };
+            found = if c == '`' { found + 1 } else { 0 };
+        }
+
+        if found != backticks {
+            // The input ended before the closing delimiter was complete.
+            self.terminated = false;
+            let remaining = backticks - found;
+            let noun = if remaining == 1 { "backtick" } else { "backticks" };
+            let message = if found == 0 {
+                format_eco!("expected {} {}", remaining, noun)
+            } else {
+                format_eco!("expected {} more {}", remaining, noun)
+            };
+            return SyntaxKind::Error(ErrorPos::End, message);
+        }
+
+        // The contents lie between the opening and closing delimiters.
+        let end = self.s.cursor() - found as usize;
+        let inner = self.s.get(start..end);
+        SyntaxKind::Raw(Arc::new(resolve_raw(column, backticks, inner)))
+    }
+
+    /// Lex an enum numbering such as `1.`, starting at the already eaten
+    /// first digit. Digits that are not followed by a dot, or that do not
+    /// fit into a `usize`, are lexed as plain text instead.
+    fn numbering(&mut self, start: usize) -> SyntaxKind {
+        self.s.eat_while(char::is_ascii_digit);
+        let digits = self.s.from(start);
+
+        if !self.s.eat_if('.') {
+            return self.text(start);
+        }
+
+        match digits.parse::<usize>() {
+            Ok(number) => match NonZeroUsize::new(number) {
+                Some(number) => SyntaxKind::EnumNumbering(number),
+                None => SyntaxKind::Error(ErrorPos::Full, "must be positive".into()),
+            },
+            Err(_) => self.text(start),
+        }
+    }
+
+    /// Lex the target of a reference whose `@` has already been eaten.
+    fn reference(&mut self) -> SyntaxKind {
+        let target = self.s.eat_while(is_id_continue);
+        SyntaxKind::Ref(target.into())
+    }
+
+    /// Whether the character that was just eaten sits between two
+    /// alphanumeric characters.
+    fn in_word(&self) -> bool {
+        let is_alphanumeric = |c: Option<char>| matches!(c, Some(c) if c.is_alphanumeric());
+
+        // `scout(-1)` is the just eaten character, so `scout(-2)` is the
+        // one in front of it.
+        let before = self.s.scout(-2);
+        let after = self.s.peek();
+        is_alphanumeric(before) && is_alphanumeric(after)
+    }
+}
+
+/// Math.
+impl Lexer<'_> {
+ /// Lex one token in math mode.
+ ///
+ /// `c` is the character that was just eaten and `start` is the scanner's
+ /// cursor position from before it was eaten. Arms are tried top to bottom,
+ /// so the longer shorthands must come before their prefixes (e.g. `<->`
+ /// and `<=>` before `<=` and `<-`).
+ fn math(&mut self, start: usize, c: char) -> SyntaxKind {
+ match c {
+ // Symbol shorthands.
+ '|' if self.s.eat_if("->") => SyntaxKind::Shorthand('\u{21A6}'),
+ '<' if self.s.eat_if("->") => SyntaxKind::Shorthand('\u{2194}'),
+ '<' if self.s.eat_if("=>") => SyntaxKind::Shorthand('\u{21D4}'),
+ '!' if self.s.eat_if('=') => SyntaxKind::Shorthand('\u{2260}'),
+ '<' if self.s.eat_if('=') => SyntaxKind::Shorthand('\u{2264}'),
+ '>' if self.s.eat_if('=') => SyntaxKind::Shorthand('\u{2265}'),
+ '<' if self.s.eat_if('-') => SyntaxKind::Shorthand('\u{2190}'),
+ '-' if self.s.eat_if('>') => SyntaxKind::Shorthand('\u{2192}'),
+ '=' if self.s.eat_if('>') => SyntaxKind::Shorthand('\u{21D2}'),
+ ':' if self.s.eat_if('=') => SyntaxKind::Shorthand('\u{2254}'),
+
+ // Multi-char things.
+ '#' => self.hash(start),
+
+ // Escape sequences.
+ '\\' => self.backslash(),
+
+ // Single-char things.
+ '_' => SyntaxKind::Underscore,
+ '^' => SyntaxKind::Hat,
+ '/' => SyntaxKind::Slash,
+ '&' => SyntaxKind::Amp,
+ '$' => SyntaxKind::Dollar,
+
+ // Symbol notation.
+ ':' => self.colon(),
+
+ // Strings.
+ '"' => self.string(),
+
+ // Identifiers and symbol notation.
+ // Only taken for words of at least two characters; a single letter
+ // falls through to the atom arm at the bottom.
+ c if is_math_id_start(c) && self.s.at(is_math_id_continue) => {
+ self.s.eat_while(is_math_id_continue);
+
+ // Each `:segment` that follows turns the word into a symbol.
+ let mut symbol = false;
+ while self.s.eat_if(':')
+ && !self.s.eat_while(char::is_alphanumeric).is_empty()
+ {
+ symbol = true;
+ }
+
+ // NOTE(review): a colon that is eaten by the loop condition but
+ // not followed by a segment is given back only in the `Ident`
+ // branch; in the `Symbol` branch it stays part of the token text
+ // (e.g. `a:b:`) -- confirm that this is intended.
+ if symbol {
+ SyntaxKind::Symbol(self.s.from(start).into())
+ } else {
+ if self.s.scout(-1) == Some(':') {
+ self.s.uneat();
+ }
+
+ SyntaxKind::Ident(self.s.from(start).into())
+ }
+ }
+
+ // Numbers.
+ c if c.is_numeric() => {
+ self.s.eat_while(char::is_numeric);
+ SyntaxKind::Atom(self.s.from(start).into())
+ }
+
+ // Other math atoms.
+ c => SyntaxKind::Atom(c.into()),
+ }
+ }
+}
+
+/// Code.
+impl Lexer<'_> {
+ /// Lex one token in code mode.
+ ///
+ /// `c` is the character that was just eaten and `start` is the scanner's
+ /// cursor position from before it was eaten. Arms are tried top to bottom:
+ /// labels come before the `<` operators and two-char operators before
+ /// their single-char prefixes.
+ fn code(&mut self, start: usize, c: char) -> SyntaxKind {
+ match c {
+ // Blocks.
+ '{' => SyntaxKind::LeftBrace,
+ '}' => SyntaxKind::RightBrace,
+ '[' => SyntaxKind::LeftBracket,
+ ']' => SyntaxKind::RightBracket,
+
+ // Parentheses.
+ '(' => SyntaxKind::LeftParen,
+ ')' => SyntaxKind::RightParen,
+
+ // Math.
+ '$' => SyntaxKind::Dollar,
+
+ // Labels and raw.
+ '<' if self.s.at(is_id_continue) => self.label(),
+ '`' => self.raw(),
+
+ // Two-char operators.
+ '=' if self.s.eat_if('=') => SyntaxKind::EqEq,
+ '!' if self.s.eat_if('=') => SyntaxKind::ExclEq,
+ '<' if self.s.eat_if('=') => SyntaxKind::LtEq,
+ '>' if self.s.eat_if('=') => SyntaxKind::GtEq,
+ '+' if self.s.eat_if('=') => SyntaxKind::PlusEq,
+ '-' if self.s.eat_if('=') => SyntaxKind::HyphEq,
+ '*' if self.s.eat_if('=') => SyntaxKind::StarEq,
+ '/' if self.s.eat_if('=') => SyntaxKind::SlashEq,
+ '.' if self.s.eat_if('.') => SyntaxKind::Dots,
+ '=' if self.s.eat_if('>') => SyntaxKind::Arrow,
+
+ // Single-char operators.
+ ',' => SyntaxKind::Comma,
+ ';' => SyntaxKind::Semicolon,
+ ':' => SyntaxKind::Colon,
+ '+' => SyntaxKind::Plus,
+ '-' => SyntaxKind::Minus,
+ '*' => SyntaxKind::Star,
+ '/' => SyntaxKind::Slash,
+ '=' => SyntaxKind::Eq,
+ '<' => SyntaxKind::Lt,
+ '>' => SyntaxKind::Gt,
+ // A dot directly in front of a digit starts a number instead (below).
+ '.' if !self.s.at(char::is_ascii_digit) => SyntaxKind::Dot,
+
+ // Identifiers.
+ c if is_id_start(c) => self.ident(start),
+
+ // Numbers.
+ c if c.is_ascii_digit() || (c == '.' && self.s.at(char::is_ascii_digit)) => {
+ self.number(start, c)
+ }
+
+ // Strings.
+ '"' => self.string(),
+
+ // Invalid token.
+ _ => SyntaxKind::Error(ErrorPos::Full, "not valid here".into()),
+ }
+ }
+
+    /// Lex an identifier that begins at `start` (its first character is
+    /// already eaten). Literal names and keywords get their own token kinds.
+    fn ident(&mut self, start: usize) -> SyntaxKind {
+        self.s.eat_while(is_id_continue);
+        let name = self.s.from(start);
+
+        match name {
+            "none" => SyntaxKind::None,
+            "auto" => SyntaxKind::Auto,
+            "true" => SyntaxKind::Bool(true),
+            "false" => SyntaxKind::Bool(false),
+            _ => match keyword(name) {
+                Some(kind) => kind,
+                None => SyntaxKind::Ident(name.into()),
+            },
+        }
+    }
+
+    /// Lex a numeric literal that begins at `start`, with optional fractional
+    /// part, exponent and unit suffix. `c` is the already eaten first
+    /// character: either a digit or the leading `.` of a literal like `.5`.
+    fn number(&mut self, start: usize, c: char) -> SyntaxKind {
+        let leading_dot = c == '.';
+
+        // The integer part, or the fractional part if the literal began
+        // with a dot.
+        self.s.eat_while(char::is_ascii_digit);
+
+        // The fractional part, unless it was already read. Two dots are a
+        // range operator, not a decimal separator.
+        if !leading_dot && !self.s.at("..") && self.s.eat_if('.') {
+            self.s.eat_while(char::is_ascii_digit);
+        }
+
+        // The exponent. An `e` that begins the `em` unit is not an exponent.
+        if !self.s.at("em") && self.s.eat_if(['e', 'E']) {
+            self.s.eat_if(['+', '-']);
+            self.s.eat_while(char::is_ascii_digit);
+        }
+
+        // The suffix: either `%` or a run of ASCII alphanumerics.
+        let suffix_start = self.s.cursor();
+        let is_percent = self.s.eat_if('%');
+        if !is_percent {
+            self.s.eat_while(char::is_ascii_alphanumeric);
+        }
+
+        let number = self.s.get(start..suffix_start);
+        let suffix = self.s.from(suffix_start);
+
+        // Without a suffix, prefer an integer if the digits parse as one.
+        match number.parse::<i64>() {
+            Ok(int) if suffix.is_empty() => return SyntaxKind::Int(int),
+            _ => {}
+        }
+
+        let Ok(value) = number.parse::<f64>() else {
+            return SyntaxKind::Error(ErrorPos::Full, "invalid number".into());
+        };
+
+        let unit = match suffix {
+            "" => return SyntaxKind::Float(value),
+            "pt" => Unit::Length(AbsUnit::Pt),
+            "mm" => Unit::Length(AbsUnit::Mm),
+            "cm" => Unit::Length(AbsUnit::Cm),
+            "in" => Unit::Length(AbsUnit::In),
+            "deg" => Unit::Angle(AngleUnit::Deg),
+            "rad" => Unit::Angle(AngleUnit::Rad),
+            "em" => Unit::Em,
+            "fr" => Unit::Fr,
+            "%" => Unit::Percent,
+            _ => {
+                return SyntaxKind::Error(ErrorPos::Full, "invalid number suffix".into())
+            }
+        };
+
+        SyntaxKind::Numeric(value, unit)
+    }
+
+    /// Lex a string literal whose opening quote has already been eaten.
+    fn string(&mut self) -> SyntaxKind {
+        // Tracks whether the character under inspection is preceded by an
+        // unescaped backslash; an escaped quote does not close the string.
+        let mut escaped = false;
+        let verbatim = self.s.eat_until(|c| {
+            let closes = c == '"' && !escaped;
+            if !closes {
+                escaped = c == '\\' && !escaped;
+            }
+            closes
+        });
+
+        let string = resolve_string(verbatim);
+
+        if !self.s.eat_if('"') {
+            self.terminated = false;
+            return SyntaxKind::Error(ErrorPos::End, "expected quote".into());
+        }
+
+        SyntaxKind::Str(string)
+    }
+
+    /// Lex a label whose opening `<` has already been eaten.
+    fn label(&mut self) -> SyntaxKind {
+        let name = self.s.eat_while(is_id_continue);
+
+        if !self.s.eat_if('>') {
+            self.terminated = false;
+            return SyntaxKind::Error(
+                ErrorPos::End,
+                "expected closing angle bracket".into(),
+            );
+        }
+
+        if name.is_empty() {
+            SyntaxKind::Error(ErrorPos::Full, "label cannot be empty".into())
+        } else {
+            SyntaxKind::Label(name.into())
+        }
+    }
+}
+
+/// Map an identifier to its keyword token, or `None` if it is not a
+/// keyword.
+fn keyword(ident: &str) -> Option<SyntaxKind> {
+    let kind = match ident {
+        "not" => SyntaxKind::Not,
+        "and" => SyntaxKind::And,
+        "or" => SyntaxKind::Or,
+        "let" => SyntaxKind::Let,
+        "set" => SyntaxKind::Set,
+        "show" => SyntaxKind::Show,
+        "if" => SyntaxKind::If,
+        "else" => SyntaxKind::Else,
+        "for" => SyntaxKind::For,
+        "in" => SyntaxKind::In,
+        "while" => SyntaxKind::While,
+        "break" => SyntaxKind::Break,
+        "continue" => SyntaxKind::Continue,
+        "return" => SyntaxKind::Return,
+        "import" => SyntaxKind::Import,
+        "include" => SyntaxKind::Include,
+        "as" => SyntaxKind::As,
+        _ => return None,
+    };
+
+    Some(kind)
+}
+
+/// The column index of a given index in the source string, given a column
+/// offset for the first line.
+///
+/// The column is the number of characters between the last newline in front
+/// of `index` and `index` itself. The `offset` is added only when `index`
+/// lies on the first line, i.e. when no newline precedes it.
+fn column(string: &str, index: usize, offset: usize) -> usize {
+    let before = &string[..index];
+
+    // Locate the last newline character in front of the index, if any.
+    let last_newline = before.char_indices().rev().find(|&(_, c)| is_newline(c));
+
+    match last_newline {
+        // Count the characters that follow the newline.
+        Some((pos, newline)) => before[pos + newline.len_utf8()..].chars().count(),
+        // Still on the first line: everything counts, plus the offset.
+        None => before.chars().count() + offset,
+    }
+}
+
+/// Whether this character denotes a newline.
+#[inline]
+pub fn is_newline(character: char) -> bool {
+    /// All characters that are treated as line terminators.
+    const NEWLINES: [char; 7] = [
+        '\n',       // Line Feed
+        '\x0B',     // Vertical Tab
+        '\x0C',     // Form Feed
+        '\r',       // Carriage Return
+        '\u{0085}', // Next Line
+        '\u{2028}', // Line Separator
+        '\u{2029}', // Paragraph Separator
+    ];
+
+    NEWLINES.contains(&character)
+}
+
+/// Whether a string is a valid unicode identifier.
+///
+/// In addition to what is specified in the [Unicode Standard][uax31], we allow:
+/// - `_` as a starting character,
+/// - `_` and `-` as continuing characters.
+///
+/// The empty string is not an identifier.
+///
+/// [uax31]: http://www.unicode.org/reports/tr31/
+#[inline]
+pub fn is_ident(string: &str) -> bool {
+    let mut chars = string.chars();
+    match chars.next() {
+        Some(first) => is_id_start(first) && chars.all(is_id_continue),
+        None => false,
+    }
+}
+
+/// Whether a character can start an identifier: an underscore or any
+/// character with the `XID_Start` property.
+#[inline]
+fn is_id_start(c: char) -> bool {
+    c == '_' || c.is_xid_start()
+}
+
+/// Whether a character can continue an identifier: an underscore, a hyphen
+/// or any character with the `XID_Continue` property.
+#[inline]
+fn is_id_continue(c: char) -> bool {
+    matches!(c, '_' | '-') || c.is_xid_continue()
+}
+
+/// Whether a character can start an identifier in math. Unlike in code, an
+/// underscore does not qualify.
+#[inline]
+fn is_math_id_start(c: char) -> bool {
+    UnicodeXID::is_xid_start(c)
+}
+
+/// Whether a character can continue an identifier in math. Underscores are
+/// excluded even though they have the `XID_Continue` property.
+#[inline]
+fn is_math_id_continue(c: char) -> bool {
+    if c == '_' {
+        return false;
+    }
+    c.is_xid_continue()
+}