summaryrefslogtreecommitdiff
path: root/crates/typst-syntax/src/lexer.rs
diff options
context:
space:
mode:
authorLaurenz <laurmaedje@gmail.com>2023-07-18 20:11:31 +0200
committerLaurenz <laurmaedje@gmail.com>2023-07-18 21:04:46 +0200
commitf5953887c9ae0b40a0c3e0ab516daf425c5a598c (patch)
treeb517ca68517e49bdf458bfa92036a8ff855c72f6 /crates/typst-syntax/src/lexer.rs
parent7dc605307cf7d69a3476b8b6fc4786f683c3289b (diff)
Extract syntax module into typst-syntax crate
Diffstat (limited to 'crates/typst-syntax/src/lexer.rs')
-rw-r--r--crates/typst-syntax/src/lexer.rs739
1 files changed, 739 insertions, 0 deletions
diff --git a/crates/typst-syntax/src/lexer.rs b/crates/typst-syntax/src/lexer.rs
new file mode 100644
index 00000000..b96b3c07
--- /dev/null
+++ b/crates/typst-syntax/src/lexer.rs
@@ -0,0 +1,739 @@
+use ecow::{eco_format, EcoString};
+use unicode_ident::{is_xid_continue, is_xid_start};
+use unicode_segmentation::UnicodeSegmentation;
+use unscanny::Scanner;
+
+use super::SyntaxKind;
+
+/// A tokenizer that turns a string of source code into a sequence of
+/// [`SyntaxKind`]s, one token per call to `next`.
+#[derive(Clone)]
+pub(super) struct Lexer<'s> {
+    /// The scanner that walks over the source text.
+    s: Scanner<'s>,
+    /// The current lexing mode, which decides what kinds of tokens are
+    /// produced.
+    mode: LexMode,
+    /// Whether the most recently lexed token contained a newline.
+    newline: bool,
+    /// The error message attached to the most recently lexed token, if any.
+    error: Option<EcoString>,
+}
+
+/// The kind of tokens the lexer should emit.
+#[derive(Debug, Copy, Clone, Eq, PartialEq)]
+pub(super) enum LexMode {
+    /// Markup mode: text and markup syntax.
+    Markup,
+    /// Math mode: math atoms, operators, and so on.
+    Math,
+    /// Code mode: keywords, literals and operators.
+    Code,
+}
+
+impl<'s> Lexer<'s> {
+    /// Create a new lexer over `text` that starts out in the given mode.
+    ///
+    /// The newline flag starts out cleared and no error is pending.
+    pub fn new(text: &'s str, mode: LexMode) -> Self {
+        let s = Scanner::new(text);
+        Self { s, mode, newline: false, error: None }
+    }
+
+    /// The mode the lexer is currently in.
+    pub fn mode(&self) -> LexMode {
+        self.mode
+    }
+
+    /// Switch the lexer to another mode.
+    pub fn set_mode(&mut self, mode: LexMode) {
+        self.mode = mode;
+    }
+
+    /// The index in the source string at which the last token ended and at
+    /// which the next token will begin.
+    pub fn cursor(&self) -> usize {
+        self.s.cursor()
+    }
+
+    /// Move the lexer to the given index in the source string.
+    pub fn jump(&mut self, index: usize) {
+        self.s.jump(index);
+    }
+
+    /// Whether the most recently lexed token contained a newline.
+    pub fn newline(&self) -> bool {
+        self.newline
+    }
+
+    /// Remove and return the error of the last token, if there is one.
+    ///
+    /// After this call no error is pending anymore.
+    pub fn take_error(&mut self) -> Option<EcoString> {
+        self.error.take()
+    }
+}
+
+impl Lexer<'_> {
+    /// Record `message` as the error of the current token and return the
+    /// error token kind, so that callers can write `return self.error(..)`.
+    fn error(&mut self, message: impl Into<EcoString>) -> SyntaxKind {
+        let message: EcoString = message.into();
+        self.error = Some(message);
+        SyntaxKind::Error
+    }
+}
+
+/// Shared.
+impl Lexer<'_> {
+    /// Lex the next token and return its kind.
+    ///
+    /// The newline flag and any pending error are reset first. Whitespace
+    /// and comments are handled identically in every mode; all other input
+    /// is handed to the routine of the current mode. Returns
+    /// `SyntaxKind::Eof` when no input is left.
+    pub fn next(&mut self) -> SyntaxKind {
+        self.newline = false;
+        self.error = None;
+
+        let start = self.s.cursor();
+        let c = match self.s.eat() {
+            Some(c) => c,
+            None => return SyntaxKind::Eof,
+        };
+
+        if c.is_whitespace() {
+            return self.whitespace(start, c);
+        }
+
+        match c {
+            '/' if self.s.eat_if('/') => self.line_comment(),
+            '/' if self.s.eat_if('*') => self.block_comment(),
+            // A `*/` that is not inside of a block comment.
+            '*' if self.s.eat_if('/') => {
+                self.error("unexpected end of block comment")
+            }
+            _ => match self.mode {
+                LexMode::Markup => self.markup(start, c),
+                LexMode::Math => self.math(start, c),
+                LexMode::Code => self.code(start, c),
+            },
+        }
+    }
+
+    /// Lex a run of whitespace that began at `start` with the already eaten
+    /// character `c`.
+    ///
+    /// Sets the newline flag if the run contains at least one newline. In
+    /// markup mode, two or more newlines produce a `Parbreak`; everything
+    /// else is a `Space`.
+    fn whitespace(&mut self, start: usize, c: char) -> SyntaxKind {
+        let rest = self.s.eat_while(char::is_whitespace);
+
+        // A lone space cannot contain a newline, so counting is skipped.
+        let single_space = c == ' ' && rest.is_empty();
+        let newlines = if single_space { 0 } else { count_newlines(self.s.from(start)) };
+
+        self.newline = newlines != 0;
+        match self.mode {
+            LexMode::Markup if newlines >= 2 => SyntaxKind::Parbreak,
+            _ => SyntaxKind::Space,
+        }
+    }
+
+    /// Lex a line comment whose `//` has already been eaten. The comment
+    /// extends up to, but not including, the next newline character.
+    fn line_comment(&mut self) -> SyntaxKind {
+        self.s.eat_until(is_newline);
+        SyntaxKind::LineComment
+    }
+
+    /// Lex a block comment whose opening `/*` has already been eaten.
+    ///
+    /// Block comments nest. If the input ends before the comment is closed,
+    /// the rest of the input becomes part of the comment and no error is
+    /// recorded.
+    fn block_comment(&mut self) -> SyntaxKind {
+        // The previously eaten character, or `_` if it must not combine with
+        // the next one (at the start and right after a delimiter).
+        let mut prev = '_';
+        let mut depth = 1;
+
+        // Search for the `*/` that closes the outermost comment.
+        while let Some(c) = self.s.eat() {
+            match (prev, c) {
+                ('*', '/') => {
+                    depth -= 1;
+                    if depth == 0 {
+                        break;
+                    }
+                    prev = '_';
+                }
+                ('/', '*') => {
+                    depth += 1;
+                    prev = '_';
+                }
+                // A line comment inside of the block comment: skip to the
+                // end of the line, so a `*/` on that line is not seen.
+                ('/', '/') => {
+                    self.line_comment();
+                    prev = '_';
+                }
+                _ => prev = c,
+            }
+        }
+
+        SyntaxKind::BlockComment
+    }
+}
+
+/// Markup.
+impl Lexer<'_> {
+    /// Lex a markup token whose first character `c` has already been eaten.
+    ///
+    /// `start` is the index at which the token began.
+    fn markup(&mut self, start: usize, c: char) -> SyntaxKind {
+        match c {
+            '\\' => self.backslash(),
+            '`' => self.raw(),
+            // Automatic links.
+            'h' if self.s.eat_if("ttp://") || self.s.eat_if("ttps://") => self.link(),
+            '<' if self.s.at(is_id_continue) => self.label(),
+            '@' => self.ref_marker(),
+
+            // Multi-character shorthands, longest candidate first.
+            '.' if self.s.eat_if("..") => SyntaxKind::Shorthand,
+            '-' if self.s.eat_if("--") || self.s.eat_if('-') || self.s.eat_if('?') => {
+                SyntaxKind::Shorthand
+            }
+            // `*` and `_` only become their own tokens outside of words.
+            '*' if !self.in_word() => SyntaxKind::Star,
+            '_' if !self.in_word() => SyntaxKind::Underscore,
+
+            '#' => SyntaxKind::Hashtag,
+            '[' => SyntaxKind::LeftBracket,
+            ']' => SyntaxKind::RightBracket,
+            '\'' | '"' => SyntaxKind::SmartQuote,
+            '$' => SyntaxKind::Dollar,
+            '~' => SyntaxKind::Shorthand,
+            ':' => SyntaxKind::Colon,
+            '=' => {
+                // A run of `=` is a heading marker only when it is followed
+                // by whitespace or the end of the input.
+                self.s.eat_while('=');
+                if self.space_or_end() {
+                    SyntaxKind::HeadingMarker
+                } else {
+                    self.text()
+                }
+            }
+            // These markers need trailing whitespace or the end of the input.
+            '-' if self.space_or_end() => SyntaxKind::ListMarker,
+            '+' if self.space_or_end() => SyntaxKind::EnumMarker,
+            '/' if self.space_or_end() => SyntaxKind::TermMarker,
+            '0'..='9' => self.numbering(start),
+
+            _ => self.text(),
+        }
+    }
+
+    /// Lex what follows an already eaten backslash.
+    ///
+    /// Produces a `Linebreak` if the backslash is followed by whitespace or
+    /// the end of the input, and an `Escape` otherwise. A `\u{...}` escape
+    /// must be closed and must contain a valid hexadecimal codepoint.
+    fn backslash(&mut self) -> SyntaxKind {
+        if !self.s.eat_if("u{") {
+            if self.s.done() || self.s.at(char::is_whitespace) {
+                return SyntaxKind::Linebreak;
+            }
+
+            // Escape of a single character.
+            self.s.eat();
+            return SyntaxKind::Escape;
+        }
+
+        let hex = self.s.eat_while(char::is_ascii_alphanumeric);
+        if !self.s.eat_if('}') {
+            return self.error("unclosed Unicode escape sequence");
+        }
+
+        let codepoint =
+            u32::from_str_radix(hex, 16).ok().and_then(std::char::from_u32);
+        match codepoint {
+            Some(_) => SyntaxKind::Escape,
+            None => self.error(eco_format!("invalid Unicode codepoint: {}", hex)),
+        }
+    }
+
+    /// Lex raw text whose first backtick has already been eaten.
+    ///
+    /// The raw text is closed by a run of as many backticks as opened it.
+    /// Exactly two backticks form an empty raw token.
+    fn raw(&mut self) -> SyntaxKind {
+        // The backtick eaten by the caller plus all that directly follow.
+        let opening = 1 + self.s.eat_while('`').len();
+        if opening == 2 {
+            return SyntaxKind::Raw;
+        }
+
+        // Length of the backtick run that ends at the cursor.
+        let mut run = 0;
+        while run < opening {
+            match self.s.eat() {
+                Some('`') => run += 1,
+                Some(_) => run = 0,
+                None => return self.error("unclosed raw text"),
+            }
+        }
+
+        SyntaxKind::Raw
+    }
+
+    /// Lex an automatic link whose `http://` or `https://` prefix has
+    /// already been eaten.
+    ///
+    /// Brackets and parentheses are part of the link as long as they are
+    /// balanced; brackets left open are an error. Trailing punctuation is
+    /// excluded from the link.
+    fn link(&mut self) -> SyntaxKind {
+        // The opening delimiters that are still waiting for their partner.
+        let mut open = Vec::new();
+
+        #[rustfmt::skip]
+        self.s.eat_while(|c: char| {
+            match c {
+                | '0' ..= '9'
+                | 'a' ..= 'z'
+                | 'A' ..= 'Z'
+                | '!' | '#' | '$' | '%' | '&' | '*' | '+'
+                | ',' | '-' | '.' | '/' | ':' | ';' | '='
+                | '?' | '@' | '_' | '~' | '\'' => true,
+                '[' | '(' => {
+                    open.push(c);
+                    true
+                }
+                // A closing delimiter only belongs to the link if it matches
+                // the most recently opened one.
+                ']' => open.pop() == Some('['),
+                ')' => open.pop() == Some('('),
+                _ => false,
+            }
+        });
+
+        if !open.is_empty() {
+            return self.error(
+                "automatic links cannot contain unbalanced brackets, \
+                 use the `link` function instead",
+            );
+        }
+
+        // Trailing punctuation most likely belongs to the surrounding text.
+        while matches!(self.s.scout(-1), Some('!' | ',' | '.' | ':' | ';' | '?' | '\'')) {
+            self.s.uneat();
+        }
+
+        SyntaxKind::Link
+    }
+
+    /// Lex something that begins with a digit; `start` is the index of that
+    /// digit.
+    ///
+    /// Digits that are followed by a dot and then by whitespace or the end
+    /// of the input form an `EnumMarker`. Anything else is lexed as text.
+    fn numbering(&mut self, start: usize) -> SyntaxKind {
+        self.s.eat_while(char::is_ascii_digit);
+
+        let digits = self.s.from(start);
+        let is_marker = self.s.eat_if('.')
+            && self.space_or_end()
+            && digits.parse::<usize>().is_ok();
+
+        if is_marker {
+            SyntaxKind::EnumMarker
+        } else {
+            self.text()
+        }
+    }
+
+    /// Lex a reference marker whose `@` has already been eaten.
+    fn ref_marker(&mut self) -> SyntaxKind {
+        self.s.eat_while(|c| is_id_continue(c) || matches!(c, ':' | '.'));
+
+        // Trailing dots and colons most likely belong to the surrounding
+        // text.
+        while matches!(self.s.scout(-1), Some('.' | ':')) {
+            self.s.uneat();
+        }
+
+        SyntaxKind::RefMarker
+    }
+
+    /// Lex a label whose opening `<` has already been eaten.
+    ///
+    /// The label name must not be empty and must be closed with `>`.
+    fn label(&mut self) -> SyntaxKind {
+        let name = self.s.eat_while(|c| is_id_continue(c) || matches!(c, ':' | '.'));
+
+        if name.is_empty() {
+            self.error("label cannot be empty")
+        } else if self.s.eat_if('>') {
+            SyntaxKind::Label
+        } else {
+            self.error("unclosed label")
+        }
+    }
+
+    /// Lex a run of plain text.
+    ///
+    /// The run is extended up to the next character that could start some
+    /// other token. If a short lookahead shows that this character would end
+    /// up as text anyway, it is included and the run continues.
+    fn text(&mut self) -> SyntaxKind {
+        // The ASCII characters at which a text run may have to stop.
+        const STOPS: &[u8] = b" \t\n\x0b\x0c\r\\/[]{}~-.'\"*_:h`$<>@#";
+
+        // Lookup table over the ASCII range, `true` for every stop character.
+        static TABLE: [bool; 128] = {
+            let mut table = [false; 128];
+            let mut i = 0;
+            while i < STOPS.len() {
+                table[STOPS[i] as usize] = true;
+                i += 1;
+            }
+            table
+        };
+
+        loop {
+            // Outside of the ASCII range, only whitespace stops the run.
+            self.s.eat_until(|c: char| {
+                TABLE.get(c as usize).copied().unwrap_or_else(|| c.is_whitespace())
+            });
+
+            // Look ahead on a copy of the scanner: if the stop character
+            // would become text anyway, stay within the same text node.
+            let mut ahead = self.s;
+            let still_text = match ahead.eat() {
+                Some(' ') => ahead.at(char::is_alphanumeric),
+                Some('/') => !ahead.at(['/', '*']),
+                Some('-') => !ahead.at(['-', '?']),
+                Some('.') => !ahead.at(".."),
+                Some('h') => !ahead.at("ttp://") && !ahead.at("ttps://"),
+                Some('@') => !ahead.at(is_id_start),
+                _ => false,
+            };
+
+            if !still_text {
+                break;
+            }
+
+            self.s = ahead;
+        }
+
+        SyntaxKind::Text
+    }
+
+    /// Whether the character that was just eaten sits between two
+    /// alphanumeric characters.
+    fn in_word(&self) -> bool {
+        // `scout(-2)` is the character before the one that was just eaten.
+        let before = self.s.scout(-2);
+        let after = self.s.peek();
+        matches!(before, Some(c) if c.is_alphanumeric())
+            && matches!(after, Some(c) if c.is_alphanumeric())
+    }
+
+    /// Whether the scanner is at whitespace or at the end of the input.
+    fn space_or_end(&self) -> bool {
+        match self.s.peek() {
+            Some(c) => c.is_whitespace(),
+            None => true,
+        }
+    }
+}
+
+/// Math.
+impl Lexer<'_> {
+    /// Lex a math token whose first character `c` has already been eaten.
+    ///
+    /// `start` is the index at which the token began.
+    fn math(&mut self, start: usize, c: char) -> SyntaxKind {
+        // The continuations that turn `c` into a multi-character shorthand.
+        // They are tried in order and the first one that matches is eaten,
+        // so the order within each list is significant.
+        let tails: &[&str] = match c {
+            '-' => &[">>", ">", "->"],
+            ':' => &["=", ":="],
+            '!' => &["="],
+            '.' => &[".."],
+            '[' => &["|"],
+            '<' => &[
+                "==>", "-->", "--", "-<", "->", "<-", "<<", "=>", "==", "~~", "=", "<",
+                "-", "~",
+            ],
+            '>' => &["->", ">>", "=", ">"],
+            '=' => &["=>", ">", ":"],
+            '|' => &["->", "=>", "]", "|"],
+            '~' => &["~>", ">"],
+            _ => &[],
+        };
+
+        if tails.iter().any(|tail| self.s.eat_if(*tail)) {
+            return SyntaxKind::Shorthand;
+        }
+
+        match c {
+            '\\' => self.backslash(),
+            '"' => self.string(),
+
+            // Single-character shorthands.
+            '*' | '-' => SyntaxKind::Shorthand,
+
+            '#' => SyntaxKind::Hashtag,
+            '_' => SyntaxKind::Underscore,
+            '$' => SyntaxKind::Dollar,
+            '/' => SyntaxKind::Slash,
+            '^' => SyntaxKind::Hat,
+            '\'' => SyntaxKind::Prime,
+            '&' => SyntaxKind::MathAlignPoint,
+            '√' | '∛' | '∜' => SyntaxKind::Root,
+
+            // Identifiers. These consist of at least two characters; a
+            // single character falls through to the text case below.
+            c if is_math_id_start(c) && self.s.at(is_math_id_continue) => {
+                self.s.eat_while(is_math_id_continue);
+                SyntaxKind::MathIdent
+            }
+
+            // Other math atoms.
+            _ => self.math_text(start, c),
+        }
+    }
+
+    /// Lex a text atom in math that began at `start` with the already eaten
+    /// character `c`.
+    ///
+    /// A number, including a fractional part if digits follow the dot, is
+    /// kept together. Anything else yields exactly one grapheme cluster.
+    fn math_text(&mut self, start: usize, c: char) -> SyntaxKind {
+        if !c.is_numeric() {
+            // Find the first grapheme cluster beginning at `start` and move
+            // the cursor right behind it.
+            let rest = self.s.get(start..self.s.string().len());
+            let cluster_len = rest.graphemes(true).next().map_or(0, str::len);
+            self.s.jump(start + cluster_len);
+            return SyntaxKind::Text;
+        }
+
+        self.s.eat_while(char::is_numeric);
+
+        // Only commit to the dot if at least one digit comes after it.
+        let mut ahead = self.s;
+        if ahead.eat_if('.') && !ahead.eat_while(char::is_numeric).is_empty() {
+            self.s = ahead;
+        }
+
+        SyntaxKind::Text
+    }
+}
+
+/// Code.
+impl Lexer<'_> {
+    /// Lex a code token whose first character `c` has already been eaten.
+    ///
+    /// `start` is the index at which the token began. Characters that cannot
+    /// start any code token produce an error.
+    fn code(&mut self, start: usize, c: char) -> SyntaxKind {
+        match c {
+            '`' => self.raw(),
+            '<' if self.s.at(is_id_continue) => self.label(),
+            '0'..='9' => self.number(start, c),
+            // A dot directly followed by a digit starts a number like `.5`.
+            '.' if self.s.at(char::is_ascii_digit) => self.number(start, c),
+            '"' => self.string(),
+
+            // Two-character operators. These must come before the
+            // single-character ones below.
+            '=' if self.s.eat_if('=') => SyntaxKind::EqEq,
+            '!' if self.s.eat_if('=') => SyntaxKind::ExclEq,
+            '<' if self.s.eat_if('=') => SyntaxKind::LtEq,
+            '>' if self.s.eat_if('=') => SyntaxKind::GtEq,
+            '+' if self.s.eat_if('=') => SyntaxKind::PlusEq,
+            '-' if self.s.eat_if('=') => SyntaxKind::HyphEq,
+            '*' if self.s.eat_if('=') => SyntaxKind::StarEq,
+            '/' if self.s.eat_if('=') => SyntaxKind::SlashEq,
+            '.' if self.s.eat_if('.') => SyntaxKind::Dots,
+            '=' if self.s.eat_if('>') => SyntaxKind::Arrow,
+
+            '{' => SyntaxKind::LeftBrace,
+            '}' => SyntaxKind::RightBrace,
+            '[' => SyntaxKind::LeftBracket,
+            ']' => SyntaxKind::RightBracket,
+            '(' => SyntaxKind::LeftParen,
+            ')' => SyntaxKind::RightParen,
+            '$' => SyntaxKind::Dollar,
+            ',' => SyntaxKind::Comma,
+            ';' => SyntaxKind::Semicolon,
+            ':' => SyntaxKind::Colon,
+            '.' => SyntaxKind::Dot,
+            '+' => SyntaxKind::Plus,
+            '-' => SyntaxKind::Minus,
+            '*' => SyntaxKind::Star,
+            '/' => SyntaxKind::Slash,
+            '=' => SyntaxKind::Eq,
+            '<' => SyntaxKind::Lt,
+            '>' => SyntaxKind::Gt,
+
+            c if is_id_start(c) => self.ident(start),
+
+            c => self.error(eco_format!("the character `{c}` is not valid in code")),
+        }
+    }
+
+    /// Lex an identifier or keyword that began at `start`.
+    ///
+    /// An identifier directly after a single `.` or after `@` is never
+    /// turned into a keyword. After `..`, keywords are recognized as usual.
+    fn ident(&mut self, start: usize) -> SyntaxKind {
+        self.s.eat_while(is_id_continue);
+        let ident = self.s.from(start);
+
+        let prev = self.s.get(0..start);
+        let keyword_allowed = !prev.ends_with(['.', '@']) || prev.ends_with("..");
+        if keyword_allowed {
+            if let Some(kind) = keyword(ident) {
+                return kind;
+            }
+        }
+
+        if ident == "_" {
+            SyntaxKind::Underscore
+        } else {
+            SyntaxKind::Ident
+        }
+    }
+
+    /// Lex a number that began at `start` with the already eaten character
+    /// `c`, which is either a digit or a `.` that is followed by a digit.
+    ///
+    /// Produces `Int`, `Float`, or, if a valid unit suffix is attached,
+    /// `Numeric`. Malformed numbers and unknown suffixes are errors.
+    fn number(&mut self, mut start: usize, c: char) -> SyntaxKind {
+        // Handle alternative integer bases.
+        let mut base = 10;
+        if c == '0' {
+            if self.s.eat_if('b') {
+                base = 2;
+            } else if self.s.eat_if('o') {
+                base = 8;
+            } else if self.s.eat_if('x') {
+                base = 16;
+            }
+            if base != 10 {
+                // The digits are parsed without the base prefix.
+                start = self.s.cursor();
+            }
+        }
+
+        // Read the first run of digits. This is the integer part, or the
+        // fractional part if the number started with a `.`.
+        self.s.eat_while(if base == 16 {
+            char::is_ascii_alphanumeric
+        } else {
+            char::is_ascii_digit
+        });
+
+        // Only decimal numbers have a fractional part or an exponent. The
+        // base is checked before anything is eaten so that a number in
+        // another base does not swallow a `.` or `e` it cannot make use of.
+        if base == 10 {
+            // Read the fractional part if not already done.
+            // Make sure not to confuse a range or a field access for the
+            // decimal separator.
+            if c != '.'
+                && !self.s.at("..")
+                && !self.s.scout(1).map_or(false, is_id_start)
+                && self.s.eat_if('.')
+            {
+                self.s.eat_while(char::is_ascii_digit);
+            }
+
+            // Read the exponent, unless the `e` starts the `em` suffix.
+            if !self.s.at("em") && self.s.eat_if(['e', 'E']) {
+                self.s.eat_if(['+', '-']);
+                self.s.eat_while(char::is_ascii_digit);
+            }
+        }
+
+        // Read the suffix.
+        let suffix_start = self.s.cursor();
+        if !self.s.eat_if('%') {
+            self.s.eat_while(char::is_ascii_alphanumeric);
+        }
+
+        let number = self.s.get(start..suffix_start);
+        let suffix = self.s.from(suffix_start);
+
+        // Integers that do not fit into an `i64` fall back to floats, but
+        // only in base 10.
+        let kind = if i64::from_str_radix(number, base).is_ok() {
+            SyntaxKind::Int
+        } else if base == 10 && number.parse::<f64>().is_ok() {
+            SyntaxKind::Float
+        } else {
+            return self.error(match base {
+                2 => eco_format!("invalid binary number: 0b{}", number),
+                8 => eco_format!("invalid octal number: 0o{}", number),
+                16 => eco_format!("invalid hexadecimal number: 0x{}", number),
+                _ => eco_format!("invalid number: {}", number),
+            });
+        };
+
+        if suffix.is_empty() {
+            return kind;
+        }
+
+        if !matches!(
+            suffix,
+            "pt" | "mm" | "cm" | "in" | "deg" | "rad" | "em" | "fr" | "%"
+        ) {
+            return self.error(eco_format!("invalid number suffix: {}", suffix));
+        }
+
+        SyntaxKind::Numeric
+    }
+
+    /// Lex a string whose opening quote has already been eaten.
+    ///
+    /// The string extends to the next double quote that is not escaped with
+    /// a backslash. Reaching the end of the input first is an error.
+    fn string(&mut self) -> SyntaxKind {
+        // Whether the previous character was a backslash that was not
+        // itself escaped.
+        let mut escaped = false;
+        self.s.eat_until(|c| {
+            let stop = c == '"' && !escaped;
+            escaped = c == '\\' && !escaped;
+            stop
+        });
+
+        if self.s.eat_if('"') {
+            SyntaxKind::Str
+        } else {
+            self.error("unclosed string")
+        }
+    }
+}
+
+/// Map an identifier to the kind of the keyword it spells, or return `None`
+/// if it is not a keyword.
+fn keyword(ident: &str) -> Option<SyntaxKind> {
+    let kind = match ident {
+        "none" => SyntaxKind::None,
+        "auto" => SyntaxKind::Auto,
+        "true" | "false" => SyntaxKind::Bool,
+        "not" => SyntaxKind::Not,
+        "and" => SyntaxKind::And,
+        "or" => SyntaxKind::Or,
+        "let" => SyntaxKind::Let,
+        "set" => SyntaxKind::Set,
+        "show" => SyntaxKind::Show,
+        "if" => SyntaxKind::If,
+        "else" => SyntaxKind::Else,
+        "for" => SyntaxKind::For,
+        "in" => SyntaxKind::In,
+        "while" => SyntaxKind::While,
+        "break" => SyntaxKind::Break,
+        "continue" => SyntaxKind::Continue,
+        "return" => SyntaxKind::Return,
+        "import" => SyntaxKind::Import,
+        "include" => SyntaxKind::Include,
+        "as" => SyntaxKind::As,
+        _ => return None,
+    };
+    Some(kind)
+}
+
+/// Whether Typst treats the given character as a newline.
+#[inline]
+pub fn is_newline(character: char) -> bool {
+    const NEWLINES: [char; 7] = [
+        // Line Feed, Vertical Tab, Form Feed, Carriage Return.
+        '\n', '\x0B', '\x0C', '\r',
+        // Next Line, Line Separator, Paragraph Separator.
+        '\u{0085}', '\u{2028}', '\u{2029}',
+    ];
+    NEWLINES.contains(&character)
+}
+
+/// Split text into its lines.
+///
+/// The newline characters themselves are not part of the returned lines and
+/// `\r\n` counts as a single newline. The result always contains at least
+/// one (possibly empty) line.
+pub(super) fn split_newlines(text: &str) -> Vec<&str> {
+    let mut lines = Vec::new();
+    let mut chars = text.char_indices().peekable();
+    // Byte index at which the current line begins.
+    let mut start = 0;
+
+    while let Some((index, c)) = chars.next() {
+        if !is_newline(c) {
+            continue;
+        }
+
+        lines.push(&text[start..index]);
+        start = index + c.len_utf8();
+
+        // Treat `\r\n` as one newline.
+        if c == '\r' && matches!(chars.peek(), Some(&(_, '\n'))) {
+            chars.next();
+            start += 1;
+        }
+    }
+
+    lines.push(&text[start..]);
+    lines
+}
+
+/// Count how many newlines the text contains, where `\r\n` counts as a
+/// single newline.
+fn count_newlines(text: &str) -> usize {
+    let mut count = 0;
+    let mut chars = text.chars().peekable();
+
+    while let Some(c) = chars.next() {
+        if !is_newline(c) {
+            continue;
+        }
+
+        // Skip the `\n` of a `\r\n` pair so that it is not counted twice.
+        if c == '\r' && chars.peek() == Some(&'\n') {
+            chars.next();
+        }
+
+        count += 1;
+    }
+
+    count
+}
+
+/// Whether a string is a valid Typst identifier.
+///
+/// The empty string is not an identifier. On top of what the
+/// [Unicode Standard][uax31] specifies, the following is allowed:
+/// - `_` as the first character,
+/// - `_` and `-` as any later character.
+///
+/// [uax31]: http://www.unicode.org/reports/tr31/
+#[inline]
+pub fn is_ident(string: &str) -> bool {
+    let mut chars = string.chars();
+    match chars.next() {
+        Some(first) => is_id_start(first) && chars.all(is_id_continue),
+        None => false,
+    }
+}
+
+/// Whether an identifier can begin with the given character.
+#[inline]
+pub fn is_id_start(c: char) -> bool {
+    c == '_' || is_xid_start(c)
+}
+
+/// Whether the given character can appear in an identifier after its first
+/// character.
+#[inline]
+pub fn is_id_continue(c: char) -> bool {
+    matches!(c, '_' | '-') || is_xid_continue(c)
+}
+
+/// Whether an identifier in math can begin with the given character.
+///
+/// Unlike in code, `_` cannot start an identifier here.
+#[inline]
+fn is_math_id_start(c: char) -> bool {
+    is_xid_start(c)
+}
+
+/// Whether the given character can appear in a math identifier after its
+/// first character. The underscore is excluded.
+#[inline]
+fn is_math_id_continue(c: char) -> bool {
+    c != '_' && is_xid_continue(c)
+}