summaryrefslogtreecommitdiff
path: root/src/parse
diff options
context:
space:
mode:
authorLaurenz <laurmaedje@gmail.com>2020-10-01 11:32:48 +0200
committerLaurenz <laurmaedje@gmail.com>2020-10-01 11:32:48 +0200
commit885bfec5d7524845b41e180fadc9cf5626157eec (patch)
treef798e03d101d568a110a5c56f4a9bfa2be892928 /src/parse
parent16f0bd430e0864a3bbd0139803e476be413cb3cb (diff)
Make syntax not depend on parse 📩
This would make it possible to split them into two separate crates.
Diffstat (limited to 'src/parse')
-rw-r--r--src/parse/lines.rs88
-rw-r--r--src/parse/mod.rs2
-rw-r--r--src/parse/resolve.rs14
-rw-r--r--src/parse/scanner.rs55
-rw-r--r--src/parse/tokens.rs12
5 files changed, 125 insertions, 46 deletions
diff --git a/src/parse/lines.rs b/src/parse/lines.rs
new file mode 100644
index 00000000..ce5a1fe5
--- /dev/null
+++ b/src/parse/lines.rs
@@ -0,0 +1,88 @@
+//! Conversion of byte positions to line/column locations.
+
+use super::Scanner;
+use crate::syntax::{Location, Pos};
+
+/// Enables conversion of byte positions to locations.
+pub struct LineMap<'s> {
+ src: &'s str,
+ line_starts: Vec<Pos>,
+}
+
+impl<'s> LineMap<'s> {
+ /// Create a new line map for a source string.
+ pub fn new(src: &'s str) -> Self {
+ let mut line_starts = vec![Pos::ZERO];
+ let mut s = Scanner::new(src);
+
+ while let Some(c) = s.eat_merging_crlf() {
+ if is_newline(c) {
+ line_starts.push(s.index().into());
+ }
+ }
+
+ Self { src, line_starts }
+ }
+
+ /// Convert a byte position to a location.
+ ///
+ /// # Panics
+ /// This panics if the position is out of bounds.
+ pub fn location(&self, pos: Pos) -> Location {
+ let line_index = match self.line_starts.binary_search(&pos) {
+ Ok(i) => i,
+ Err(i) => i - 1,
+ };
+
+ let line_start = self.line_starts[line_index];
+ let head = &self.src[line_start.to_usize() .. pos.to_usize()];
+ let column_index = head.chars().count();
+
+ Location {
+ line: 1 + line_index as u32,
+ column: 1 + column_index as u32,
+ }
+ }
+}
+
+/// Whether this character denotes a newline.
+pub fn is_newline(character: char) -> bool {
+ match character {
+ // Line Feed, Vertical Tab, Form Feed, Carriage Return.
+ '\n' | '\x0B' | '\x0C' | '\r' |
+ // Next Line, Line Separator, Paragraph Separator.
+ '\u{0085}' | '\u{2028}' | '\u{2029}' => true,
+ _ => false,
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+
+ const TEST: &str = "äbcde\nf💛g\r\nhi\rjkl";
+
+ #[test]
+ fn test_line_map_new() {
+ let map = LineMap::new(TEST);
+ assert_eq!(map.line_starts, vec![Pos(0), Pos(7), Pos(15), Pos(18)]);
+ }
+
+ #[test]
+ fn test_line_map_location() {
+ let map = LineMap::new(TEST);
+ assert_eq!(map.location(Pos(0)), Location::new(1, 1));
+ assert_eq!(map.location(Pos(2)), Location::new(1, 2));
+ assert_eq!(map.location(Pos(6)), Location::new(1, 6));
+ assert_eq!(map.location(Pos(7)), Location::new(2, 1));
+ assert_eq!(map.location(Pos(8)), Location::new(2, 2));
+ assert_eq!(map.location(Pos(12)), Location::new(2, 3));
+ assert_eq!(map.location(Pos(21)), Location::new(4, 4));
+ }
+
+ #[test]
+ #[should_panic]
+ fn test_line_map_panics_out_of_bounds() {
+ LineMap::new(TEST).location(Pos(22));
+ }
+}
diff --git a/src/parse/mod.rs b/src/parse/mod.rs
index 8c879d12..b62bd5d3 100644
--- a/src/parse/mod.rs
+++ b/src/parse/mod.rs
@@ -1,9 +1,11 @@
//! Parsing and tokenization.
+mod lines;
mod resolve;
mod scanner;
mod tokens;
+pub use lines::*;
pub use resolve::*;
pub use scanner::*;
pub use tokens::*;
diff --git a/src/parse/resolve.rs b/src/parse/resolve.rs
index 0e2ebd4b..d4babd25 100644
--- a/src/parse/resolve.rs
+++ b/src/parse/resolve.rs
@@ -1,6 +1,6 @@
//! Resolve strings and raw blocks.
-use super::{is_newline_char, Scanner};
+use super::{is_newline, Scanner};
use crate::syntax::{Ident, Raw};
/// Resolves all escape sequences in a string.
@@ -42,8 +42,8 @@ pub fn resolve_string(string: &str) -> String {
out
}
-/// Resolve a hexademical escape sequence (only the inner hex letters without
-/// braces or `\u`) into a character.
+/// Resolve a hexadecimal escape sequence into a character
+/// (only the inner hex letters without braces or `\u`).
pub fn resolve_hex(sequence: &str) -> Option<char> {
u32::from_str_radix(sequence, 16).ok().and_then(std::char::from_u32)
}
@@ -71,7 +71,7 @@ pub fn resolve_raw(raw: &str, backticks: usize) -> Raw {
fn split_at_lang_tag(raw: &str) -> (&str, &str) {
let mut s = Scanner::new(raw);
(
- s.eat_until(|c| c == '`' || c.is_whitespace() || is_newline_char(c)),
+ s.eat_until(|c| c == '`' || c.is_whitespace() || is_newline(c)),
s.rest(),
)
}
@@ -101,15 +101,15 @@ fn trim_and_split_raw(raw: &str) -> (Vec<String>, bool) {
(lines, had_newline)
}
-/// Splits a string into a vector of lines (respecting Unicode & Windows line
-/// breaks).
+/// Splits a string into a vector of lines
+/// (respecting Unicode, Unix, Mac and Windows line breaks).
pub fn split_lines(text: &str) -> Vec<String> {
let mut s = Scanner::new(text);
let mut line = String::new();
let mut lines = Vec::new();
while let Some(c) = s.eat_merging_crlf() {
- if is_newline_char(c) {
+ if is_newline(c) {
lines.push(std::mem::take(&mut line));
} else {
line.push(c);
diff --git a/src/parse/scanner.rs b/src/parse/scanner.rs
index 1bffc204..9447222d 100644
--- a/src/parse/scanner.rs
+++ b/src/parse/scanner.rs
@@ -102,9 +102,28 @@ impl<'s> Scanner<'s> {
pub fn check(&self, f: impl FnMut(char) -> bool) -> bool {
self.peek().map(f).unwrap_or(false)
}
+
+ /// Go back to where the index says.
+ fn reset(&mut self) {
+ self.iter = self.src[self.index ..].chars();
+ }
}
impl<'s> Scanner<'s> {
+ /// The current index in the string.
+ pub fn index(&self) -> usize {
+ self.index
+ }
+
+ /// The previous index in the string.
+ pub fn prev_index(&self) -> usize {
+ self.src[.. self.index]
+ .chars()
+ .next_back()
+ .map(|c| self.index - c.len_utf8())
+ .unwrap_or(0)
+ }
+
/// Slice a part out of the source string.
pub fn get<I>(&self, index: I) -> &'s str
where
@@ -118,39 +137,20 @@ impl<'s> Scanner<'s> {
self.src
}
- /// The full string up to the current index.
+ /// The full source string up to the current index.
pub fn eaten(&self) -> &'s str {
&self.src[.. self.index]
}
- /// The string from `start` to the current index.
+ /// The source string from `start` to the current index.
pub fn eaten_from(&self, start: usize) -> &'s str {
&self.src[start .. self.index]
}
- /// The remaining string after the current index.
+ /// The remaining source string after the current index.
pub fn rest(&self) -> &'s str {
&self.src[self.index ..]
}
-
- /// The current index in the string.
- pub fn index(&self) -> usize {
- self.index
- }
-
- /// The previous index in the string.
- pub fn prev_index(&self) -> usize {
- self.src[.. self.index]
- .chars()
- .next_back()
- .map(|c| self.index - c.len_utf8())
- .unwrap_or(0)
- }
-
- /// Go back to the where the index says.
- fn reset(&mut self) {
- self.iter = self.src[self.index ..].chars();
- }
}
impl Debug for Scanner<'_> {
@@ -158,14 +158,3 @@ impl Debug for Scanner<'_> {
write!(f, "Scanner({}|{})", self.eaten(), self.rest())
}
}
-
-/// Whether this character denotes a newline.
-pub fn is_newline_char(character: char) -> bool {
- match character {
- // Line Feed, Vertical Tab, Form Feed, Carriage Return.
- '\n' | '\x0B' | '\x0C' | '\r' |
- // Next Line, Line Separator, Paragraph Separator.
- '\u{0085}' | '\u{2028}' | '\u{2029}' => true,
- _ => false,
- }
-}
diff --git a/src/parse/tokens.rs b/src/parse/tokens.rs
index cdb92c59..9f30f587 100644
--- a/src/parse/tokens.rs
+++ b/src/parse/tokens.rs
@@ -1,8 +1,8 @@
//! Tokenization.
-use super::{is_newline_char, Scanner};
+use super::{is_newline, Scanner};
use crate::length::Length;
-use crate::syntax::{Ident, Pos, Span, SpanWith, Spanned, Token};
+use crate::syntax::{is_ident, Pos, Span, SpanWith, Spanned, Token};
use TokenMode::*;
@@ -115,7 +115,7 @@ impl<'s> Tokens<'s> {
// Uneat the first char if it's a newline, so that it's counted in the
// loop.
- if is_newline_char(first) {
+ if is_newline(first) {
self.s.uneat();
}
@@ -127,7 +127,7 @@ impl<'s> Tokens<'s> {
break;
}
- if is_newline_char(c) {
+ if is_newline(c) {
newlines += 1;
}
}
@@ -136,7 +136,7 @@ impl<'s> Tokens<'s> {
}
fn read_line_comment(&mut self) -> Token<'s> {
- Token::LineComment(self.s.eat_until(is_newline_char))
+ Token::LineComment(self.s.eat_until(is_newline))
}
fn read_block_comment(&mut self) -> Token<'s> {
@@ -277,7 +277,7 @@ fn parse_expr(text: &str) -> Token<'_> {
Token::Number(num / 100.0)
} else if let Ok(length) = text.parse::<Length>() {
Token::Length(length)
- } else if Ident::is_ident(text) {
+ } else if is_ident(text) {
Token::Ident(text)
} else {
Token::Invalid(text)