From 885bfec5d7524845b41e180fadc9cf5626157eec Mon Sep 17 00:00:00 2001 From: Laurenz Date: Thu, 1 Oct 2020 11:32:48 +0200 Subject: =?UTF-8?q?Make=20syntax=20not=20depend=20on=20parse=20?= =?UTF-8?q?=F0=9F=93=A9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This would make it possible to split them into two separate crates. --- src/parse/lines.rs | 88 ++++++++++++++++++++++++++++++++++++++++++++++++++++ src/parse/mod.rs | 2 ++ src/parse/resolve.rs | 14 ++++----- src/parse/scanner.rs | 55 +++++++++++++------------------- src/parse/tokens.rs | 12 +++---- 5 files changed, 125 insertions(+), 46 deletions(-) create mode 100644 src/parse/lines.rs (limited to 'src/parse') diff --git a/src/parse/lines.rs b/src/parse/lines.rs new file mode 100644 index 00000000..ce5a1fe5 --- /dev/null +++ b/src/parse/lines.rs @@ -0,0 +1,88 @@ +//! Conversion of byte positions to line/column locations. + +use super::Scanner; +use crate::syntax::{Location, Pos}; + +/// Enables conversion of byte position to locations. +pub struct LineMap<'s> { + src: &'s str, + line_starts: Vec<Pos>, +} + +impl<'s> LineMap<'s> { + /// Create a new line map for a source string. + pub fn new(src: &'s str) -> Self { + let mut line_starts = vec![Pos::ZERO]; + let mut s = Scanner::new(src); + + while let Some(c) = s.eat_merging_crlf() { + if is_newline(c) { + line_starts.push(s.index().into()); + } + } + + Self { src, line_starts } + } + + /// Convert a byte position to a location. + /// + /// # Panics + /// This panics if the position is out of bounds. + pub fn location(&self, pos: Pos) -> Location { + let line_index = match self.line_starts.binary_search(&pos) { + Ok(i) => i, + Err(i) => i - 1, + }; + + let line_start = self.line_starts[line_index]; + let head = &self.src[line_start.to_usize() .. 
pos.to_usize()]; + let column_index = head.chars().count(); + + Location { + line: 1 + line_index as u32, + column: 1 + column_index as u32, + } + } +} + +/// Whether this character denotes a newline. +pub fn is_newline(character: char) -> bool { + match character { + // Line Feed, Vertical Tab, Form Feed, Carriage Return. + '\n' | '\x0B' | '\x0C' | '\r' | + // Next Line, Line Separator, Paragraph Separator. + '\u{0085}' | '\u{2028}' | '\u{2029}' => true, + _ => false, + } +} + +#[cfg(test)] +mod tests { + use super::*; + + const TEST: &str = "äbcde\nf💛g\r\nhi\rjkl"; + + #[test] + fn test_line_map_new() { + let map = LineMap::new(TEST); + assert_eq!(map.line_starts, vec![Pos(0), Pos(7), Pos(15), Pos(18)]); + } + + #[test] + fn test_line_map_location() { + let map = LineMap::new(TEST); + assert_eq!(map.location(Pos(0)), Location::new(1, 1)); + assert_eq!(map.location(Pos(2)), Location::new(1, 2)); + assert_eq!(map.location(Pos(6)), Location::new(1, 6)); + assert_eq!(map.location(Pos(7)), Location::new(2, 1)); + assert_eq!(map.location(Pos(8)), Location::new(2, 2)); + assert_eq!(map.location(Pos(12)), Location::new(2, 3)); + assert_eq!(map.location(Pos(21)), Location::new(4, 4)); + } + + #[test] + #[should_panic] + fn test_line_map_panics_out_of_bounds() { + LineMap::new(TEST).location(Pos(22)); + } +} diff --git a/src/parse/mod.rs b/src/parse/mod.rs index 8c879d12..b62bd5d3 100644 --- a/src/parse/mod.rs +++ b/src/parse/mod.rs @@ -1,9 +1,11 @@ //! Parsing and tokenization. +mod lines; mod resolve; mod scanner; mod tokens; +pub use lines::*; pub use resolve::*; pub use scanner::*; pub use tokens::*; diff --git a/src/parse/resolve.rs b/src/parse/resolve.rs index 0e2ebd4b..d4babd25 100644 --- a/src/parse/resolve.rs +++ b/src/parse/resolve.rs @@ -1,6 +1,6 @@ //! Resolve strings and raw blocks. -use super::{is_newline_char, Scanner}; +use super::{is_newline, Scanner}; use crate::syntax::{Ident, Raw}; /// Resolves all escape sequences in a string. 
@@ -42,8 +42,8 @@ pub fn resolve_string(string: &str) -> String { out } -/// Resolve a hexademical escape sequence (only the inner hex letters without -/// braces or `\u`) into a character. +/// Resolve a hexademical escape sequence into a character +/// (only the inner hex letters without braces or `\u`). pub fn resolve_hex(sequence: &str) -> Option<char> { u32::from_str_radix(sequence, 16).ok().and_then(std::char::from_u32) } @@ -71,7 +71,7 @@ pub fn resolve_raw(raw: &str, backticks: usize) -> Raw { fn split_at_lang_tag(raw: &str) -> (&str, &str) { let mut s = Scanner::new(raw); ( - s.eat_until(|c| c == '`' || c.is_whitespace() || is_newline_char(c)), + s.eat_until(|c| c == '`' || c.is_whitespace() || is_newline(c)), s.rest(), ) } @@ -101,15 +101,15 @@ fn trim_and_split_raw(raw: &str) -> (Vec<String>, bool) { (lines, had_newline) } -/// Splits a string into a vector of lines (respecting Unicode & Windows line -/// breaks). +/// Splits a string into a vector of lines +/// (respecting Unicode, Unix, Mac and Windows line breaks). pub fn split_lines(text: &str) -> Vec<String> { let mut s = Scanner::new(text); let mut line = String::new(); let mut lines = Vec::new(); while let Some(c) = s.eat_merging_crlf() { - if is_newline_char(c) { + if is_newline(c) { lines.push(std::mem::take(&mut line)); } else { line.push(c); diff --git a/src/parse/scanner.rs b/src/parse/scanner.rs index 1bffc204..9447222d 100644 --- a/src/parse/scanner.rs +++ b/src/parse/scanner.rs @@ -102,9 +102,28 @@ impl<'s> Scanner<'s> { pub fn check(&self, f: impl FnMut(char) -> bool) -> bool { self.peek().map(f).unwrap_or(false) } + + /// Go back to the where the index says. + fn reset(&mut self) { + self.iter = self.src[self.index ..].chars(); + } } impl<'s> Scanner<'s> { + /// The current index in the string. + pub fn index(&self) -> usize { + self.index + } + + /// The previous index in the string. + pub fn prev_index(&self) -> usize { + self.src[.. 
self.index] + .chars() + .next_back() + .map(|c| self.index - c.len_utf8()) + .unwrap_or(0) + } + /// Slice a part out of the source string. pub fn get<I>(&self, index: I) -> &'s str where @@ -118,39 +137,20 @@ impl<'s> Scanner<'s> { self.src } - /// The full string up to the current index. + /// The full source string up to the current index. pub fn eaten(&self) -> &'s str { &self.src[.. self.index] } - /// The string from `start` to the current index. + /// The source string from `start` to the current index. pub fn eaten_from(&self, start: usize) -> &'s str { &self.src[start .. self.index] } - /// The remaining string after the current index. + /// The remaining source string after the current index. pub fn rest(&self) -> &'s str { &self.src[self.index ..] } - - /// The current index in the string. - pub fn index(&self) -> usize { - self.index - } - - /// The previous index in the string. - pub fn prev_index(&self) -> usize { - self.src[.. self.index] - .chars() - .next_back() - .map(|c| self.index - c.len_utf8()) - .unwrap_or(0) - } - - /// Go back to the where the index says. - fn reset(&mut self) { - self.iter = self.src[self.index ..].chars(); - } } impl Debug for Scanner<'_> { @@ -158,14 +158,3 @@ impl Debug for Scanner<'_> { write!(f, "Scanner({}|{})", self.eaten(), self.rest()) } } - -/// Whether this character denotes a newline. -pub fn is_newline_char(character: char) -> bool { - match character { - // Line Feed, Vertical Tab, Form Feed, Carriage Return. - '\n' | '\x0B' | '\x0C' | '\r' | - // Next Line, Line Separator, Paragraph Separator. - '\u{0085}' | '\u{2028}' | '\u{2029}' => true, - _ => false, - } -} diff --git a/src/parse/tokens.rs b/src/parse/tokens.rs index cdb92c59..9f30f587 100644 --- a/src/parse/tokens.rs +++ b/src/parse/tokens.rs @@ -1,8 +1,8 @@ //! Tokenization. 
-use super::{is_newline_char, Scanner}; +use super::{is_newline, Scanner}; use crate::length::Length; -use crate::syntax::{Ident, Pos, Span, SpanWith, Spanned, Token}; +use crate::syntax::{is_ident, Pos, Span, SpanWith, Spanned, Token}; use TokenMode::*; @@ -115,7 +115,7 @@ impl<'s> Tokens<'s> { // Uneat the first char if it's a newline, so that it's counted in the // loop. - if is_newline_char(first) { + if is_newline(first) { self.s.uneat(); } @@ -127,7 +127,7 @@ impl<'s> Tokens<'s> { break; } - if is_newline_char(c) { + if is_newline(c) { newlines += 1; } } @@ -136,7 +136,7 @@ impl<'s> Tokens<'s> { } fn read_line_comment(&mut self) -> Token<'s> { - Token::LineComment(self.s.eat_until(is_newline_char)) + Token::LineComment(self.s.eat_until(is_newline)) } fn read_block_comment(&mut self) -> Token<'s> { @@ -277,7 +277,7 @@ fn parse_expr(text: &str) -> Token<'_> { Token::Number(num / 100.0) } else if let Ok(length) = text.parse::<Length>() { Token::Length(length) - } else if Ident::is_ident(text) { + } else if is_ident(text) { Token::Ident(text) } else { Token::Invalid(text) -- cgit v1.2.3