summaryrefslogtreecommitdiff
path: root/src/parse
diff options
context:
space:
mode:
authorLaurenz <laurmaedje@gmail.com>2020-10-01 11:32:48 +0200
committerLaurenz <laurmaedje@gmail.com>2020-10-01 11:32:48 +0200
commit885bfec5d7524845b41e180fadc9cf5626157eec (patch)
treef798e03d101d568a110a5c56f4a9bfa2be892928 /src/parse
parent16f0bd430e0864a3bbd0139803e476be413cb3cb (diff)
Make syntax not depend on parse 📩
This would make it possible to split them into two separate crates.
Diffstat (limited to 'src/parse')
-rw-r--r--src/parse/lines.rs88
-rw-r--r--src/parse/mod.rs2
-rw-r--r--src/parse/resolve.rs14
-rw-r--r--src/parse/scanner.rs55
-rw-r--r--src/parse/tokens.rs12
5 files changed, 125 insertions, 46 deletions
diff --git a/src/parse/lines.rs b/src/parse/lines.rs
new file mode 100644
index 00000000..ce5a1fe5
--- /dev/null
+++ b/src/parse/lines.rs
@@ -0,0 +1,88 @@
+//! Conversion of byte positions to line/column locations.
+
+use super::Scanner;
+use crate::syntax::{Location, Pos};
+
+/// Enables conversion of byte positions to locations.
+pub struct LineMap<'s> {
+ src: &'s str,
+ line_starts: Vec<Pos>,
+}
+
+impl<'s> LineMap<'s> {
+ /// Create a new line map for a source string.
+ pub fn new(src: &'s str) -> Self {
+ let mut line_starts = vec![Pos::ZERO];
+ let mut s = Scanner::new(src);
+
+ while let Some(c) = s.eat_merging_crlf() {
+ if is_newline(c) {
+ line_starts.push(s.index().into());
+ }
+ }
+
+ Self { src, line_starts }
+ }
+
+ /// Convert a byte position to a location.
+ ///
+ /// # Panics
+ /// This panics if the position is out of bounds.
+ pub fn location(&self, pos: Pos) -> Location {
+ let line_index = match self.line_starts.binary_search(&pos) {
+ Ok(i) => i,
+ Err(i) => i - 1,
+ };
+
+ let line_start = self.line_starts[line_index];
+ let head = &self.src[line_start.to_usize() .. pos.to_usize()];
+ let column_index = head.chars().count();
+
+ Location {
+ line: 1 + line_index as u32,
+ column: 1 + column_index as u32,
+ }
+ }
+}
+
+/// Whether this character denotes a newline.
+pub fn is_newline(character: char) -> bool {
+ match character {
+ // Line Feed, Vertical Tab, Form Feed, Carriage Return.
+ '\n' | '\x0B' | '\x0C' | '\r' |
+ // Next Line, Line Separator, Paragraph Separator.
+ '\u{0085}' | '\u{2028}' | '\u{2029}' => true,
+ _ => false,
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+
+ const TEST: &str = "äbcde\nf💛g\r\nhi\rjkl";
+
+ #[test]
+ fn test_line_map_new() {
+ let map = LineMap::new(TEST);
+ assert_eq!(map.line_starts, vec![Pos(0), Pos(7), Pos(15), Pos(18)]);
+ }
+
+ #[test]
+ fn test_line_map_location() {
+ let map = LineMap::new(TEST);
+ assert_eq!(map.location(Pos(0)), Location::new(1, 1));
+ assert_eq!(map.location(Pos(2)), Location::new(1, 2));
+ assert_eq!(map.location(Pos(6)), Location::new(1, 6));
+ assert_eq!(map.location(Pos(7)), Location::new(2, 1));
+ assert_eq!(map.location(Pos(8)), Location::new(2, 2));
+ assert_eq!(map.location(Pos(12)), Location::new(2, 3));
+ assert_eq!(map.location(Pos(21)), Location::new(4, 4));
+ }
+
+ #[test]
+ #[should_panic]
+ fn test_line_map_panics_out_of_bounds() {
+ LineMap::new(TEST).location(Pos(22));
+ }
+}
diff --git a/src/parse/mod.rs b/src/parse/mod.rs
index 8c879d12..b62bd5d3 100644
--- a/src/parse/mod.rs
+++ b/src/parse/mod.rs
@@ -1,9 +1,11 @@
//! Parsing and tokenization.
+mod lines;
mod resolve;
mod scanner;
mod tokens;
+pub use lines::*;
pub use resolve::*;
pub use scanner::*;
pub use tokens::*;
diff --git a/src/parse/resolve.rs b/src/parse/resolve.rs
index 0e2ebd4b..d4babd25 100644
--- a/src/parse/resolve.rs
+++ b/src/parse/resolve.rs
@@ -1,6 +1,6 @@
//! Resolve strings and raw blocks.
-use super::{is_newline_char, Scanner};
+use super::{is_newline, Scanner};
use crate::syntax::{Ident, Raw};
/// Resolves all escape sequences in a string.
@@ -42,8 +42,8 @@ pub fn resolve_string(string: &str) -> String {
out
}
-/// Resolve a hexademical escape sequence (only the inner hex letters without
-/// braces or `\u`) into a character.
+/// Resolve a hexadecimal escape sequence into a character
+/// (only the inner hex letters without braces or `\u`).
pub fn resolve_hex(sequence: &str) -> Option<char> {
u32::from_str_radix(sequence, 16).ok().and_then(std::char::from_u32)
}
@@ -71,7 +71,7 @@ pub fn resolve_raw(raw: &str, backticks: usize) -> Raw {
fn split_at_lang_tag(raw: &str) -> (&str, &str) {
let mut s = Scanner::new(raw);
(
- s.eat_until(|c| c == '`' || c.is_whitespace() || is_newline_char(c)),
+ s.eat_until(|c| c == '`' || c.is_whitespace() || is_newline(c)),
s.rest(),
)
}
@@ -101,15 +101,15 @@ fn trim_and_split_raw(raw: &str) -> (Vec<String>, bool) {
(lines, had_newline)
}
-/// Splits a string into a vector of lines (respecting Unicode & Windows line
-/// breaks).
+/// Splits a string into a vector of lines
+/// (respecting Unicode, Unix, Mac and Windows line breaks).
pub fn split_lines(text: &str) -> Vec<String> {
let mut s = Scanner::new(text);
let mut line = String::new();
let mut lines = Vec::new();
while let Some(c) = s.eat_merging_crlf() {
- if is_newline_char(c) {
+ if is_newline(c) {
lines.push(std::mem::take(&mut line));
} else {
line.push(c);
diff --git a/src/parse/scanner.rs b/src/parse/scanner.rs
index 1bffc204..9447222d 100644
--- a/src/parse/scanner.rs
+++ b/src/parse/scanner.rs
@@ -102,9 +102,28 @@ impl<'s> Scanner<'s> {
pub fn check(&self, f: impl FnMut(char) -> bool) -> bool {
self.peek().map(f).unwrap_or(false)
}
+
+ /// Go back to where the index says.
+ fn reset(&mut self) {
+ self.iter = self.src[self.index ..].chars();
+ }
}
impl<'s> Scanner<'s> {
+ /// The current index in the string.
+ pub fn index(&self) -> usize {
+ self.index
+ }
+
+ /// The previous index in the string.
+ pub fn prev_index(&self) -> usize {
+ self.src[.. self.index]
+ .chars()
+ .next_back()
+ .map(|c| self.index - c.len_utf8())
+ .unwrap_or(0)
+ }
+
/// Slice a part out of the source string.
pub fn get<I>(&self, index: I) -> &'s str
where
@@ -118,39 +137,20 @@ impl<'s> Scanner<'s> {
self.src
}
- /// The full string up to the current index.
+ /// The full source string up to the current index.
pub fn eaten(&self) -> &'s str {
&self.src[.. self.index]
}
- /// The string from `start` to the current index.
+ /// The source string from `start` to the current index.
pub fn eaten_from(&self, start: usize) -> &'s str {
&self.src[start .. self.index]
}
- /// The remaining string after the current index.
+ /// The remaining source string after the current index.
pub fn rest(&self) -> &'s str {
&self.src[self.index ..]
}
-
- /// The current index in the string.
- pub fn index(&self) -> usize {
- self.index
- }
-
- /// The previous index in the string.
- pub fn prev_index(&self) -> usize {
- self.src[.. self.index]
- .chars()
- .next_back()
- .map(|c| self.index - c.len_utf8())
- .unwrap_or(0)
- }
-
- /// Go back to the where the index says.
- fn reset(&mut self) {
- self.iter = self.src[self.index ..].chars();
- }
}
impl Debug for Scanner<'_> {
@@ -158,14 +158,3 @@ impl Debug for Scanner<'_> {
write!(f, "Scanner({}|{})", self.eaten(), self.rest())
}
}
-
-/// Whether this character denotes a newline.
-pub fn is_newline_char(character: char) -> bool {
- match character {
- // Line Feed, Vertical Tab, Form Feed, Carriage Return.
- '\n' | '\x0B' | '\x0C' | '\r' |
- // Next Line, Line Separator, Paragraph Separator.
- '\u{0085}' | '\u{2028}' | '\u{2029}' => true,
- _ => false,
- }
-}
diff --git a/src/parse/tokens.rs b/src/parse/tokens.rs
index cdb92c59..9f30f587 100644
--- a/src/parse/tokens.rs
+++ b/src/parse/tokens.rs
@@ -1,8 +1,8 @@
//! Tokenization.
-use super::{is_newline_char, Scanner};
+use super::{is_newline, Scanner};
use crate::length::Length;
-use crate::syntax::{Ident, Pos, Span, SpanWith, Spanned, Token};
+use crate::syntax::{is_ident, Pos, Span, SpanWith, Spanned, Token};
use TokenMode::*;
@@ -115,7 +115,7 @@ impl<'s> Tokens<'s> {
// Uneat the first char if it's a newline, so that it's counted in the
// loop.
- if is_newline_char(first) {
+ if is_newline(first) {
self.s.uneat();
}
@@ -127,7 +127,7 @@ impl<'s> Tokens<'s> {
break;
}
- if is_newline_char(c) {
+ if is_newline(c) {
newlines += 1;
}
}
@@ -136,7 +136,7 @@ impl<'s> Tokens<'s> {
}
fn read_line_comment(&mut self) -> Token<'s> {
- Token::LineComment(self.s.eat_until(is_newline_char))
+ Token::LineComment(self.s.eat_until(is_newline))
}
fn read_block_comment(&mut self) -> Token<'s> {
@@ -277,7 +277,7 @@ fn parse_expr(text: &str) -> Token<'_> {
Token::Number(num / 100.0)
} else if let Ok(length) = text.parse::<Length>() {
Token::Length(length)
- } else if Ident::is_ident(text) {
+ } else if is_ident(text) {
Token::Ident(text)
} else {
Token::Invalid(text)