Extract syntax module into typst-syntax crate

author: Laurenz <laurmaedje@gmail.com> 2023-07-18 20:11:31 +0200
committer: Laurenz <laurmaedje@gmail.com> 2023-07-18 21:04:46 +0200
commit: f5953887c9ae0b40a0c3e0ab516daf425c5a598c (patch)
tree: b517ca68517e49bdf458bfa92036a8ff855c72f6 /crates/typst-syntax/src/lexer.rs
parent: 7dc605307cf7d69a3476b8b6fc4786f683c3289b (diff)
1 files changed, 739 insertions, 0 deletions
diff --git a/crates/typst-syntax/src/lexer.rs b/crates/typst-syntax/src/lexer.rs
new file mode 100644
index 00000000..b96b3c07
--- /dev/null
+++ b/crates/typst-syntax/src/lexer.rs
@@ -0,0 +1,739 @@
+use ecow::{eco_format, EcoString};
+use unicode_ident::{is_xid_continue, is_xid_start};
+use unicode_segmentation::UnicodeSegmentation;
+use unscanny::Scanner;
+
+use super::SyntaxKind;
+
+/// Splits up a string of source code into tokens, one `SyntaxKind` per call to `next`.
+#[derive(Clone)]
+pub(super) struct Lexer<'s> {
+    /// The underlying scanner over the source text; its cursor is the lexer's position.
+    s: Scanner<'s>,
+    /// The mode the lexer is in. This determines which kinds of tokens it
+    /// produces. It can be changed between tokens with `set_mode`.
+    mode: LexMode,
+    /// Whether the last token contained a newline. Reset at the start of every `next` call.
+    newline: bool,
+    /// The error message for the last token, set when it was lexed as `SyntaxKind::Error`.
+    error: Option<EcoString>,
+}
+
+/// What kind of tokens to emit. `Lexer::next` dispatches on this for every character that is not whitespace or a comment.
+#[derive(Debug, Copy, Clone, Eq, PartialEq)]
+pub(super) enum LexMode {
+    /// Text and markup.
+    Markup,
+    /// Math atoms, operators, etc.
+    Math,
+    /// Keywords, literals and operators.
+    Code,
+}
+
+impl<'s> Lexer<'s> {
+    /// Create a new lexer over `text` that starts in the given mode, with the
+    /// newline flag cleared and no pending error.
+    pub fn new(text: &'s str, mode: LexMode) -> Self {
+        Self {
+            s: Scanner::new(text),
+            mode,
+            newline: false,
+            error: None,
+        }
+    }
+
+    /// Get the current lexing mode.
+    pub fn mode(&self) -> LexMode {
+        self.mode
+    }
+
+    /// Change the lexing mode. Tokens lexed from now on use the new mode.
+    pub fn set_mode(&mut self, mode: LexMode) {
+        self.mode = mode;
+    }
+
+    /// The byte index in the string at which the last token ends and the next
+    /// token will start.
+    pub fn cursor(&self) -> usize {
+        self.s.cursor()
+    }
+
+    /// Jump to the given index in the string; lexing continues from there.
+    pub fn jump(&mut self, index: usize) {
+        self.s.jump(index);
+    }
+
+    /// Whether the last token contained a newline. Only whitespace tokens set this flag.
+    pub fn newline(&self) -> bool {
+        self.newline
+    }
+
+    /// Take out the last error, if any, leaving `None` behind.
+    pub fn take_error(&mut self) -> Option<EcoString> {
+        self.error.take()
+    }
+}
+
+impl Lexer<'_> {
+    /// Record `message` as the error of the current token and return `SyntaxKind::Error`.
+    fn error(&mut self, message: impl Into<EcoString>) -> SyntaxKind {
+        self.error = Some(message.into()); // Retrieved later through `take_error`.
+        SyntaxKind::Error
+    }
+}
+
+/// Shared: the `next` entry point plus the whitespace and comment lexing used in every mode.
+impl Lexer<'_> {
+    pub fn next(&mut self) -> SyntaxKind { // Lex one token and return its kind (`Eof` at the end).
+        self.newline = false; // Per-token state is cleared before each token.
+        self.error = None;
+        let start = self.s.cursor();
+        match self.s.eat() {
+            Some(c) if c.is_whitespace() => self.whitespace(start, c),
+            Some('/') if self.s.eat_if('/') => self.line_comment(),
+            Some('/') if self.s.eat_if('*') => self.block_comment(),
+            Some('*') if self.s.eat_if('/') => {
+                self.error("unexpected end of block comment")
+            }
+            // Everything else depends on the current mode.
+            Some(c) => match self.mode {
+                LexMode::Markup => self.markup(start, c),
+                LexMode::Math => self.math(start, c),
+                LexMode::Code => self.code(start, c),
+            },
+            // The input is exhausted.
+            None => SyntaxKind::Eof,
+        }
+    }
+    /// Lex a run of whitespace whose first character `c` was eaten at `start`.
+    fn whitespace(&mut self, start: usize, c: char) -> SyntaxKind {
+        let more = self.s.eat_while(char::is_whitespace);
+        let newlines = match c {
+            ' ' if more.is_empty() => 0, // Fast path: a lone space contains no newline.
+            _ => count_newlines(self.s.from(start)),
+        };
+        // Two or more newlines form a paragraph break, but only in markup.
+        self.newline = newlines > 0;
+        if self.mode == LexMode::Markup && newlines >= 2 {
+            SyntaxKind::Parbreak
+        } else {
+            SyntaxKind::Space
+        }
+    }
+    /// Lex a line comment after its `//`: eat up to, but not including, the next newline.
+    fn line_comment(&mut self) -> SyntaxKind {
+        self.s.eat_until(is_newline);
+        SyntaxKind::LineComment
+    }
+    /// Lex a block comment after its `/*`, tracking nested block comments and line comments.
+    fn block_comment(&mut self) -> SyntaxKind {
+        let mut state = '_'; // The previously eaten char, or `_` right after a recognized pair.
+        let mut depth = 1;
+
+        // Find the first `*/` that does not correspond to a nested `/*`.
+        while let Some(c) = self.s.eat() {
+            state = match (state, c) {
+                ('*', '/') => {
+                    depth -= 1;
+                    if depth == 0 {
+                        break;
+                    }
+                    '_'
+                }
+                ('/', '*') => {
+                    depth += 1;
+                    '_'
+                }
+                ('/', '/') => {
+                    self.line_comment(); // A `*/` in the rest of this line does not close the block.
+                    '_'
+                }
+                _ => c,
+            }
+        }
+        // Also reached at the end of input: an unclosed block comment is not reported as an error.
+        SyntaxKind::BlockComment
+    }
+}
+
+/// Markup: tokens produced in `LexMode::Markup` (`backslash`, `raw` and `label` are reused by other modes).
+impl Lexer<'_> {
+    fn markup(&mut self, start: usize, c: char) -> SyntaxKind { // `c` was already eaten at `start`.
+        match c {
+            '\\' => self.backslash(),
+            '`' => self.raw(),
+            'h' if self.s.eat_if("ttp://") => self.link(),
+            'h' if self.s.eat_if("ttps://") => self.link(),
+            '<' if self.s.at(is_id_continue) => self.label(),
+            '@' => self.ref_marker(),
+            // Multi-character shorthands, and `*` / `_` when they are not inside a word.
+            '.' if self.s.eat_if("..") => SyntaxKind::Shorthand,
+            '-' if self.s.eat_if("--") => SyntaxKind::Shorthand,
+            '-' if self.s.eat_if('-') => SyntaxKind::Shorthand,
+            '-' if self.s.eat_if('?') => SyntaxKind::Shorthand,
+            '*' if !self.in_word() => SyntaxKind::Star,
+            '_' if !self.in_word() => SyntaxKind::Underscore,
+            // Single-character tokens, and markers that need a following space (or the end).
+            '#' => SyntaxKind::Hashtag,
+            '[' => SyntaxKind::LeftBracket,
+            ']' => SyntaxKind::RightBracket,
+            '\'' => SyntaxKind::SmartQuote,
+            '"' => SyntaxKind::SmartQuote,
+            '$' => SyntaxKind::Dollar,
+            '~' => SyntaxKind::Shorthand,
+            ':' => SyntaxKind::Colon,
+            '=' => {
+                self.s.eat_while('='); // A heading marker is a run of `=` followed by space or the end.
+                if self.space_or_end() {
+                    SyntaxKind::HeadingMarker
+                } else {
+                    self.text()
+                }
+            }
+            '-' if self.space_or_end() => SyntaxKind::ListMarker,
+            '+' if self.space_or_end() => SyntaxKind::EnumMarker,
+            '/' if self.space_or_end() => SyntaxKind::TermMarker,
+            '0'..='9' => self.numbering(start),
+            // Anything else is plain text.
+            _ => self.text(),
+        }
+    }
+    /// Lex what follows a `\`: a `\u{...}` escape, a linebreak, or a single-character escape.
+    fn backslash(&mut self) -> SyntaxKind {
+        if self.s.eat_if("u{") {
+            let hex = self.s.eat_while(char::is_ascii_alphanumeric);
+            if !self.s.eat_if('}') {
+                return self.error("unclosed Unicode escape sequence");
+            }
+            // The digits must parse as hexadecimal and denote a valid `char`.
+            if u32::from_str_radix(hex, 16)
+                .ok()
+                .and_then(std::char::from_u32)
+                .is_none()
+            {
+                return self.error(eco_format!("invalid Unicode codepoint: {}", hex));
+            }
+
+            return SyntaxKind::Escape;
+        }
+        // A backslash before whitespace or at the end of input is a linebreak.
+        if self.s.done() || self.s.at(char::is_whitespace) {
+            SyntaxKind::Linebreak
+        } else {
+            self.s.eat();
+            SyntaxKind::Escape
+        }
+    }
+    /// Lex raw text after its first backtick; the closing run must match the opening count.
+    fn raw(&mut self) -> SyntaxKind {
+        let mut backticks = 1;
+        while self.s.eat_if('`') {
+            backticks += 1;
+        }
+        // Exactly two backticks are an empty raw element.
+        if backticks == 2 {
+            return SyntaxKind::Raw;
+        }
+        // Search for an uninterrupted run of as many backticks as opened the raw text.
+        let mut found = 0;
+        while found < backticks {
+            match self.s.eat() {
+                Some('`') => found += 1,
+                Some(_) => found = 0,
+                None => break,
+            }
+        }
+
+        if found != backticks {
+            return self.error("unclosed raw text");
+        }
+
+        SyntaxKind::Raw
+    }
+    /// Lex the rest of an automatic link after its `http://` or `https://` prefix.
+    fn link(&mut self) -> SyntaxKind {
+        let mut brackets = Vec::new(); // Stack of currently open `[` and `(`.
+
+        #[rustfmt::skip]
+        self.s.eat_while(|c: char| {
+            match c {
+                | '0' ..= '9'
+                | 'a' ..= 'z'
+                | 'A' ..= 'Z'
+                | '!' | '#' | '$' | '%' | '&' | '*' | '+'
+                | ',' | '-' | '.' | '/' | ':' | ';' | '='
+                | '?' | '@' | '_' | '~' | '\'' => true,
+                '[' => {
+                    brackets.push(SyntaxKind::LeftBracket);
+                    true
+                }
+                '(' => {
+                    brackets.push(SyntaxKind::LeftParen);
+                    true
+                }
+                ']' => brackets.pop() == Some(SyntaxKind::LeftBracket), // A closer without its opener ends the link.
+                ')' => brackets.pop() == Some(SyntaxKind::LeftParen),
+                _ => false,
+            }
+        });
+
+        if !brackets.is_empty() {
+            return self.error(
+                "automatic links cannot contain unbalanced brackets, \
+                 use the `link` function instead",
+            );
+        }
+
+        // Don't include the trailing characters likely to be part of text.
+        while matches!(self.s.scout(-1), Some('!' | ',' | '.' | ':' | ';' | '?' | '\'')) {
+            self.s.uneat();
+        }
+
+        SyntaxKind::Link
+    }
+    /// Lex a token starting with a digit: an enum marker such as `1.` before space/end, else text.
+    fn numbering(&mut self, start: usize) -> SyntaxKind {
+        self.s.eat_while(char::is_ascii_digit);
+
+        let read = self.s.from(start);
+        if self.s.eat_if('.') && self.space_or_end() && read.parse::<usize>().is_ok() {
+            return SyntaxKind::EnumMarker;
+        }
+
+        self.text()
+    }
+    /// Lex a reference after its `@`: identifier characters plus `:` and `.`.
+    fn ref_marker(&mut self) -> SyntaxKind {
+        self.s.eat_while(|c| is_id_continue(c) || matches!(c, ':' | '.'));
+
+        // Don't include the trailing characters likely to be part of text.
+        while matches!(self.s.scout(-1), Some('.' | ':')) {
+            self.s.uneat();
+        }
+
+        SyntaxKind::RefMarker
+    }
+    /// Lex a label after its `<`; it must be non-empty and closed by `>`.
+    fn label(&mut self) -> SyntaxKind {
+        let label = self.s.eat_while(|c| is_id_continue(c) || matches!(c, ':' | '.'));
+        if label.is_empty() {
+            return self.error("label cannot be empty");
+        }
+
+        if !self.s.eat_if('>') {
+            return self.error("unclosed label");
+        }
+
+        SyntaxKind::Label
+    }
+    /// Lex plain text up to the next character that could start a different markup token.
+    fn text(&mut self) -> SyntaxKind {
+        macro_rules! table { // Builds a 128-entry ASCII lookup table with the listed chars set to `true`.
+            ($(|$c:literal)*) => {
+                static TABLE: [bool; 128] = {
+                    let mut t = [false; 128];
+                    $(t[$c as usize] = true;)*
+                    t
+                };
+            };
+        }
+
+        table! {
+            | ' ' | '\t' | '\n' | '\x0b' | '\x0c' | '\r' | '\\' | '/'
+            | '[' | ']' | '{' | '}' | '~' | '-' | '.' | '\'' | '"'
+            | '*' | '_' | ':' | 'h' | '`' | '$' | '<' | '>' | '@' | '#'
+        };
+
+        loop {
+            self.s.eat_until(|c: char| { // Chars outside the table (non-ASCII) stop the text only if whitespace.
+                TABLE.get(c as usize).copied().unwrap_or_else(|| c.is_whitespace())
+            });
+
+            // Continue with the same text node if the thing would become text
+            // anyway.
+            let mut s = self.s; // Look ahead on a copy; it is committed below only if the text continues.
+            match s.eat() {
+                Some(' ') if s.at(char::is_alphanumeric) => {}
+                Some('/') if !s.at(['/', '*']) => {}
+                Some('-') if !s.at(['-', '?']) => {}
+                Some('.') if !s.at("..") => {}
+                Some('h') if !s.at("ttp://") && !s.at("ttps://") => {}
+                Some('@') if !s.at(is_id_start) => {}
+                _ => break,
+            }
+
+            self.s = s;
+        }
+
+        SyntaxKind::Text
+    }
+    /// Whether the char just eaten has an alphanumeric char on both sides (as in `a*b`).
+    fn in_word(&self) -> bool {
+        let alphanum = |c: Option<char>| c.map_or(false, |c| c.is_alphanumeric());
+        let prev = self.s.scout(-2); // The char before the one that was just eaten.
+        let next = self.s.peek();
+        alphanum(prev) && alphanum(next)
+    }
+    /// Whether the scanner is at the end of input or at a whitespace character.
+    fn space_or_end(&self) -> bool {
+        self.s.done() || self.s.at(char::is_whitespace)
+    }
+}
+
+/// Math: tokens produced in `LexMode::Math`.
+impl Lexer<'_> {
+    fn math(&mut self, start: usize, c: char) -> SyntaxKind { // `c` was already eaten at `start`.
+        match c {
+            '\\' => self.backslash(),
+            '"' => self.string(),
+            // Shorthands. Where one is a prefix of another, the longer one is tried first.
+            '-' if self.s.eat_if(">>") => SyntaxKind::Shorthand,
+            '-' if self.s.eat_if('>') => SyntaxKind::Shorthand,
+            '-' if self.s.eat_if("->") => SyntaxKind::Shorthand,
+            ':' if self.s.eat_if('=') => SyntaxKind::Shorthand,
+            ':' if self.s.eat_if(":=") => SyntaxKind::Shorthand,
+            '!' if self.s.eat_if('=') => SyntaxKind::Shorthand,
+            '.' if self.s.eat_if("..") => SyntaxKind::Shorthand,
+            '[' if self.s.eat_if('|') => SyntaxKind::Shorthand,
+            '<' if self.s.eat_if("==>") => SyntaxKind::Shorthand,
+            '<' if self.s.eat_if("-->") => SyntaxKind::Shorthand,
+            '<' if self.s.eat_if("--") => SyntaxKind::Shorthand,
+            '<' if self.s.eat_if("-<") => SyntaxKind::Shorthand,
+            '<' if self.s.eat_if("->") => SyntaxKind::Shorthand,
+            '<' if self.s.eat_if("<-") => SyntaxKind::Shorthand,
+            '<' if self.s.eat_if("<<") => SyntaxKind::Shorthand,
+            '<' if self.s.eat_if("=>") => SyntaxKind::Shorthand,
+            '<' if self.s.eat_if("==") => SyntaxKind::Shorthand,
+            '<' if self.s.eat_if("~~") => SyntaxKind::Shorthand,
+            '<' if self.s.eat_if('=') => SyntaxKind::Shorthand,
+            '<' if self.s.eat_if('<') => SyntaxKind::Shorthand,
+            '<' if self.s.eat_if('-') => SyntaxKind::Shorthand,
+            '<' if self.s.eat_if('~') => SyntaxKind::Shorthand,
+            '>' if self.s.eat_if("->") => SyntaxKind::Shorthand,
+            '>' if self.s.eat_if(">>") => SyntaxKind::Shorthand,
+            '=' if self.s.eat_if("=>") => SyntaxKind::Shorthand,
+            '=' if self.s.eat_if('>') => SyntaxKind::Shorthand,
+            '=' if self.s.eat_if(':') => SyntaxKind::Shorthand,
+            '>' if self.s.eat_if('=') => SyntaxKind::Shorthand,
+            '>' if self.s.eat_if('>') => SyntaxKind::Shorthand,
+            '|' if self.s.eat_if("->") => SyntaxKind::Shorthand,
+            '|' if self.s.eat_if("=>") => SyntaxKind::Shorthand,
+            '|' if self.s.eat_if(']') => SyntaxKind::Shorthand,
+            '|' if self.s.eat_if('|') => SyntaxKind::Shorthand,
+            '~' if self.s.eat_if("~>") => SyntaxKind::Shorthand,
+            '~' if self.s.eat_if('>') => SyntaxKind::Shorthand,
+            '*' | '-' => SyntaxKind::Shorthand,
+            // Single-character tokens.
+            '#' => SyntaxKind::Hashtag,
+            '_' => SyntaxKind::Underscore,
+            '$' => SyntaxKind::Dollar,
+            '/' => SyntaxKind::Slash,
+            '^' => SyntaxKind::Hat,
+            '\'' => SyntaxKind::Prime,
+            '&' => SyntaxKind::MathAlignPoint,
+            '√' | '∛' | '∜' => SyntaxKind::Root,
+
+            // Identifiers need at least two characters; a single letter is lexed as text below.
+            c if is_math_id_start(c) && self.s.at(is_math_id_continue) => {
+                self.s.eat_while(is_math_id_continue);
+                SyntaxKind::MathIdent
+            }
+
+            // Other math atoms.
+            _ => self.math_text(start, c),
+        }
+    }
+    /// Lex a math atom: a number (with optional fraction) or exactly one grapheme cluster.
+    fn math_text(&mut self, start: usize, c: char) -> SyntaxKind {
+        // Keep numbers and grapheme clusters together.
+        if c.is_numeric() {
+            self.s.eat_while(char::is_numeric);
+            let mut s = self.s; // Look ahead on a copy: a `.` only belongs to the number if digits follow.
+            if s.eat_if('.') && !s.eat_while(char::is_numeric).is_empty() {
+                self.s = s;
+            }
+        } else {
+            let len = self // Byte length of the first grapheme cluster starting at `start`.
+                .s
+                .get(start..self.s.string().len())
+                .graphemes(true)
+                .next()
+                .map_or(0, str::len);
+            self.s.jump(start + len); // Move past the whole cluster, not just its first char `c`.
+        }
+        SyntaxKind::Text
+    }
+}
+
+/// Code: tokens produced in `LexMode::Code` (`string` is also used by the math lexer).
+impl Lexer<'_> {
+    fn code(&mut self, start: usize, c: char) -> SyntaxKind { // `c` was already eaten at `start`.
+        match c {
+            '`' => self.raw(),
+            '<' if self.s.at(is_id_continue) => self.label(),
+            '0'..='9' => self.number(start, c),
+            '.' if self.s.at(char::is_ascii_digit) => self.number(start, c),
+            '"' => self.string(),
+            // Two-character operators.
+            '=' if self.s.eat_if('=') => SyntaxKind::EqEq,
+            '!' if self.s.eat_if('=') => SyntaxKind::ExclEq,
+            '<' if self.s.eat_if('=') => SyntaxKind::LtEq,
+            '>' if self.s.eat_if('=') => SyntaxKind::GtEq,
+            '+' if self.s.eat_if('=') => SyntaxKind::PlusEq,
+            '-' if self.s.eat_if('=') => SyntaxKind::HyphEq,
+            '*' if self.s.eat_if('=') => SyntaxKind::StarEq,
+            '/' if self.s.eat_if('=') => SyntaxKind::SlashEq,
+            '.' if self.s.eat_if('.') => SyntaxKind::Dots,
+            '=' if self.s.eat_if('>') => SyntaxKind::Arrow,
+            // Single-character punctuation and operators.
+            '{' => SyntaxKind::LeftBrace,
+            '}' => SyntaxKind::RightBrace,
+            '[' => SyntaxKind::LeftBracket,
+            ']' => SyntaxKind::RightBracket,
+            '(' => SyntaxKind::LeftParen,
+            ')' => SyntaxKind::RightParen,
+            '$' => SyntaxKind::Dollar,
+            ',' => SyntaxKind::Comma,
+            ';' => SyntaxKind::Semicolon,
+            ':' => SyntaxKind::Colon,
+            '.' => SyntaxKind::Dot,
+            '+' => SyntaxKind::Plus,
+            '-' => SyntaxKind::Minus,
+            '*' => SyntaxKind::Star,
+            '/' => SyntaxKind::Slash,
+            '=' => SyntaxKind::Eq,
+            '<' => SyntaxKind::Lt,
+            '>' => SyntaxKind::Gt,
+
+            c if is_id_start(c) => self.ident(start),
+
+            c => self.error(eco_format!("the character `{c}` is not valid in code")),
+        }
+    }
+    /// Lex an identifier that began at `start`, yielding a keyword kind, `Underscore` or `Ident`.
+    fn ident(&mut self, start: usize) -> SyntaxKind {
+        self.s.eat_while(is_id_continue);
+        let ident = self.s.from(start);
+        // Directly after a single `.` or an `@` (but not after `..`), keywords stay plain identifiers.
+        let prev = self.s.get(0..start);
+        if !prev.ends_with(['.', '@']) || prev.ends_with("..") {
+            if let Some(keyword) = keyword(ident) {
+                return keyword;
+            }
+        }
+
+        if ident == "_" {
+            SyntaxKind::Underscore
+        } else {
+            SyntaxKind::Ident
+        }
+    }
+    /// Lex a number whose first char `c` (a digit or `.`) was eaten at `start`: `Int`, `Float` or `Numeric`.
+    fn number(&mut self, mut start: usize, c: char) -> SyntaxKind {
+        // Handle alternative integer bases.
+        let mut base = 10;
+        if c == '0' {
+            if self.s.eat_if('b') {
+                base = 2;
+            } else if self.s.eat_if('o') {
+                base = 8;
+            } else if self.s.eat_if('x') {
+                base = 16;
+            }
+            if base != 10 {
+                start = self.s.cursor(); // Exclude the `0b`/`0o`/`0x` prefix from the digits parsed below.
+            }
+        }
+
+        // Read the first run of digits (the integer part, or the fraction if `c` was `.`).
+        self.s.eat_while(if base == 16 {
+            char::is_ascii_alphanumeric
+        } else {
+            char::is_ascii_digit
+        });
+
+        // Read the fractional part if not already done.
+        // Make sure not to confuse a range for the decimal separator.
+        if c != '.'
+            && !self.s.at("..")
+            && !self.s.scout(1).map_or(false, is_id_start) // `.` followed by an identifier start is no fraction.
+            && self.s.eat_if('.')
+            && base == 10 // NOTE(review): the `.` is already eaten when this check fails — confirm intended.
+        {
+            self.s.eat_while(char::is_ascii_digit);
+        }
+
+        // Read the exponent. NOTE(review): `e`/`E` is eaten before `base == 10` is checked — confirm intended.
+        if !self.s.at("em") && self.s.eat_if(['e', 'E']) && base == 10 {
+            self.s.eat_if(['+', '-']);
+            self.s.eat_while(char::is_ascii_digit);
+        }
+
+        // Read the suffix.
+        let suffix_start = self.s.cursor();
+        if !self.s.eat_if('%') { // The suffix is either `%` or a run of ASCII alphanumerics.
+            self.s.eat_while(char::is_ascii_alphanumeric);
+        }
+
+        let number = self.s.get(start..suffix_start);
+        let suffix = self.s.from(suffix_start);
+        // Integers are tried first; base-10 text that is not an `i64` may still be a float.
+        let kind = if i64::from_str_radix(number, base).is_ok() {
+            SyntaxKind::Int
+        } else if base == 10 && number.parse::<f64>().is_ok() {
+            SyntaxKind::Float
+        } else {
+            return self.error(match base {
+                2 => eco_format!("invalid binary number: 0b{}", number),
+                8 => eco_format!("invalid octal number: 0o{}", number),
+                16 => eco_format!("invalid hexadecimal number: 0x{}", number),
+                _ => eco_format!("invalid number: {}", number),
+            });
+        };
+
+        if suffix.is_empty() {
+            return kind;
+        }
+        // Only the suffixes listed here are accepted; anything else is an error.
+        if !matches!(
+            suffix,
+            "pt" | "mm" | "cm" | "in" | "deg" | "rad" | "em" | "fr" | "%"
+        ) {
+            return self.error(eco_format!("invalid number suffix: {}", suffix));
+        }
+
+        SyntaxKind::Numeric
+    }
+    /// Lex a string after its opening `"`, up to and including the closing unescaped `"`.
+    fn string(&mut self) -> SyntaxKind {
+        let mut escaped = false; // Whether the previous char was a backslash that was not itself escaped.
+        self.s.eat_until(|c| {
+            let stop = c == '"' && !escaped;
+            escaped = c == '\\' && !escaped;
+            stop
+        });
+
+        if !self.s.eat_if('"') {
+            return self.error("unclosed string");
+        }
+
+        SyntaxKind::Str
+    }
+}
+
+/// Try to parse an identifier into a keyword, returning `None` if it is not one.
+fn keyword(ident: &str) -> Option<SyntaxKind> {
+    Some(match ident {
+        "none" => SyntaxKind::None,
+        "auto" => SyntaxKind::Auto,
+        "true" => SyntaxKind::Bool, // Both boolean literals share one kind.
+        "false" => SyntaxKind::Bool,
+        "not" => SyntaxKind::Not,
+        "and" => SyntaxKind::And,
+        "or" => SyntaxKind::Or,
+        "let" => SyntaxKind::Let,
+        "set" => SyntaxKind::Set,
+        "show" => SyntaxKind::Show,
+        "if" => SyntaxKind::If,
+        "else" => SyntaxKind::Else,
+        "for" => SyntaxKind::For,
+        "in" => SyntaxKind::In,
+        "while" => SyntaxKind::While,
+        "break" => SyntaxKind::Break,
+        "continue" => SyntaxKind::Continue,
+        "return" => SyntaxKind::Return,
+        "import" => SyntaxKind::Import,
+        "include" => SyntaxKind::Include,
+        "as" => SyntaxKind::As,
+        _ => return None, // Matching is exact and case-sensitive.
+    })
+}
+
+/// Whether a character is interpreted as a newline by Typst. Note that `\r` and `\n` each match on their own.
+#[inline]
+pub fn is_newline(character: char) -> bool {
+    matches!(
+        character,
+        // Line Feed, Vertical Tab, Form Feed, Carriage Return.
+        '\n' | '\x0B' | '\x0C' | '\r' |
+        // Next Line, Line Separator, Paragraph Separator.
+        '\u{0085}' | '\u{2028}' | '\u{2029}'
+    )
+}
+
+/// Split text at newlines. `\r\n` counts as one break, and the newline characters are not part of the lines.
+pub(super) fn split_newlines(text: &str) -> Vec<&str> {
+    let mut s = Scanner::new(text);
+    let mut lines = Vec::new();
+    let mut start = 0; // Start index of the current line.
+    let mut end = 0; // Index just past the last char eaten before the current one.
+
+    while let Some(c) = s.eat() {
+        if is_newline(c) {
+            if c == '\r' {
+                s.eat_if('\n'); // Treat `\r\n` as a single line break.
+            }
+
+            lines.push(&text[start..end]); // `end` still points before the newline here.
+            start = s.cursor();
+        }
+        end = s.cursor();
+    }
+
+    lines.push(&text[start..]); // The last line is always pushed, so the result is never empty.
+    lines
+}
+
+/// Count the number of newlines in text, counting `\r\n` as a single newline.
+fn count_newlines(text: &str) -> usize {
+    let mut newlines = 0;
+    let mut s = Scanner::new(text);
+    while let Some(c) = s.eat() {
+        if is_newline(c) {
+            if c == '\r' {
+                s.eat_if('\n'); // Skip the `\n` of a `\r\n` pair so it is not counted twice.
+            }
+            newlines += 1;
+        }
+    }
+    newlines
+}
+
+/// Whether a string is a valid Typst identifier. The empty string is not one.
+///
+/// In addition to what is specified in the [Unicode Standard][uax31], we allow:
+/// - `_` as a starting character,
+/// - `_` and `-` as continuing characters.
+///
+/// [uax31]: http://www.unicode.org/reports/tr31/
+#[inline]
+pub fn is_ident(string: &str) -> bool {
+    let mut chars = string.chars();
+    chars
+        .next() // The first char must be a valid start; all remaining ones must be valid continuations.
+        .map_or(false, |c| is_id_start(c) && chars.all(is_id_continue))
+}
+
+/// Whether a character can start an identifier: anything `is_xid_start` accepts, or `_`.
+#[inline]
+pub fn is_id_start(c: char) -> bool {
+    is_xid_start(c) || c == '_'
+}
+
+/// Whether a character can continue an identifier: anything `is_xid_continue` accepts, `_` or `-`.
+#[inline]
+pub fn is_id_continue(c: char) -> bool {
+    is_xid_continue(c) || c == '_' || c == '-'
+}
+
+/// Whether a character can start an identifier in math. Unlike `is_id_start`, `_` is not accepted.
+#[inline]
+fn is_math_id_start(c: char) -> bool {
+    is_xid_start(c)
+}
+
+/// Whether a character can continue an identifier in math. Neither `_` nor `-` is accepted here.
+#[inline]
+fn is_math_id_continue(c: char) -> bool {
+    is_xid_continue(c) && c != '_'
+}
author	Laurenz <laurmaedje@gmail.com>	2023-07-18 20:11:31 +0200
committer	Laurenz <laurmaedje@gmail.com>	2023-07-18 21:04:46 +0200
commit	f5953887c9ae0b40a0c3e0ab516daf425c5a598c (patch)
tree	b517ca68517e49bdf458bfa92036a8ff855c72f6 /crates/typst-syntax/src/lexer.rs
parent	7dc605307cf7d69a3476b8b6fc4786f683c3289b (diff)