Convert spans to line/column format 📑

author: Laurenz <laurmaedje@gmail.com> 2020-01-10 13:03:43 +0100
committer: Laurenz <laurmaedje@gmail.com> 2020-01-10 13:03:43 +0100
commit: a75ddd2c9356da85b155f5c52fd064c15e6f81b3 (patch)
tree: 40d63a65d84945bd2cbf33449096e14e84babdf9 /src/syntax
parent: 5dbc7dc5aaaea794b140c5ea7839d681110d7b79 (diff)
3 files changed, 155 insertions, 69 deletions
diff --git a/src/syntax/parsing.rs b/src/syntax/parsing.rs
index 77d64f1e..dc39145a 100644
--- a/src/syntax/parsing.rs
+++ b/src/syntax/parsing.rs
@@ -117,8 +117,12 @@ impl<'s> Parser<'s> {
             _ => error!("expected arguments or closing bracket"),
         };
 
-        let func = self.parse_func_call(name, args)?;
-        span.end = self.tokens.string_index();
+        span.end = self.tokens.get_position();
+        let (func, body_span) = self.parse_func_call(name, args)?;
+
+        if let Some(body_span) = body_span {
+            span.expand(body_span);
+        }
 
         // Finally this function is parsed to the end.
         self.add(Node::Func(func), span);
@@ -139,7 +143,6 @@ impl<'s> Parser<'s> {
         };
 
         self.add_color_token(ColorToken::FuncName, name.span);
-
         self.skip_white();
 
         Ok(name)
@@ -231,7 +234,8 @@ impl<'s> Parser<'s> {
     }
 
     /// Parse a function call.
-    fn parse_func_call(&mut self, name: Spanned<Ident>, args: FuncArgs) -> ParseResult<FuncCall> {
+    fn parse_func_call(&mut self, name: Spanned<Ident>, args: FuncArgs)
+    -> ParseResult<(FuncCall, Option<Span>)> {
         // Now we want to parse this function dynamically.
         let parser = self
             .ctx
@@ -242,32 +246,38 @@ impl<'s> Parser<'s> {
         let has_body = self.tokens.peek().map(Spanned::value) == Some(Token::LeftBracket);
 
         // Do the parsing dependent on whether the function has a body.
-        Ok(FuncCall(if has_body {
+        Ok(if has_body {
             self.advance();
 
             // Find out the string which makes the body of this function.
-            let start = self.tokens.string_index();
-            let end = find_closing_bracket(&self.src[start..])
-                .map(|end| start + end)
-                .ok_or_else(|| error!(@"expected closing bracket"))?;
+            let start_index = self.tokens.string_index();
+            let mut start_pos = self.tokens.get_position();
+            start_pos.column -= 1;
 
-            let span = Span::new(start - 1, end + 1);
+            let (mut end_index, mut end_pos) =
+                find_closing_bracket(&self.src[start_index..])
+                    .ok_or_else(|| error!(@"expected closing bracket"))?;
+
+            end_index += start_index;
+            end_pos.column += 1;
+
+            let span = Span::new(start_pos, end_pos);
 
             // Parse the body.
-            let body_string = &self.src[start..end];
-            let body = parser(args, Some(Spanned::new(body_string, span)), self.ctx)?;
+            let body_string = &self.src[start_index..end_index];
+            let body = parser(args, Some(body_string), self.ctx)?;
 
             // Skip to the end of the function in the token stream.
-            self.tokens.set_string_index(end);
+            self.tokens.set_string_index(end_index);
 
             // Now the body should be closed.
             let token = self.tokens.next().expect("parse_func_body: expected token");
             assert!(token.v == Token::RightBracket);
 
-            body
+            (FuncCall(body), Some(span))
         } else {
-            parser(args, None, self.ctx)?
-        }))
+            (FuncCall(parser(args, None, self.ctx)?), None)
+        })
     }
 
     /// Parse an expression.
@@ -399,16 +409,30 @@ impl<'s> Parser<'s> {
 }
 
 /// Find the index of the first unbalanced and unescaped closing bracket.
-fn find_closing_bracket(src: &str) -> Option<usize> {
+fn find_closing_bracket(src: &str) -> Option<(usize, Position)> {
     let mut parens = 0;
     let mut escaped = false;
+    let mut line = 1;
+    let mut line_start_index = 0;
+
     for (index, c) in src.char_indices() {
         match c {
             '\\' => {
                 escaped = !escaped;
                 continue;
             }
-            ']' if !escaped && parens == 0 => return Some(index),
+            c if is_newline_char(c) => {
+                line += 1;
+                line_start_index = index + c.len_utf8();
+            }
+            ']' if !escaped && parens == 0 => {
+                let position = Position {
+                    line,
+                    column: index - line_start_index,
+                };
+
+                return Some((index, position))
+            }
             '[' if !escaped => parens += 1,
             ']' if !escaped => parens -= 1,
             _ => {}
@@ -441,9 +465,16 @@ impl<'s> PeekableTokens<'s> {
         *self.peeked.get_or_insert_with(|| iter.next())
     }
 
-    fn string_index(&mut self) -> usize {
+    fn get_position(&self) -> Position {
         match self.peeked {
             Some(Some(peeked)) => peeked.span.start,
+            _ => self.tokens.get_position(),
+        }
+    }
+
+    fn string_index(&self) -> usize {
+        match self.peeked {
+            Some(Some(peeked)) => peeked.span.start.line,
             _ => self.tokens.string_index(),
         }
     }
@@ -577,7 +608,7 @@ mod tests {
     }
 
     fn zerospan<T>(val: T) -> Spanned<T> {
-        Spanned::new(val, Span::new(0, 0))
+        Spanned::new(val, Span::new(Position::new(0, 0), Position::new(0, 0)))
     }
 
     /// Shortcut macro to create a syntax tree. Is `vec`-like and the elements
@@ -751,36 +782,29 @@ mod tests {
     #[test]
     #[rustfmt::skip]
     fn parse_spans() {
-        let mut scope = Scope::new();
-        scope.add::<TreeFn>("hello");
+        fn test_span(src: &str, correct: Vec<(usize, usize, usize, usize)>) {
+            let mut scope = Scope::new();
+            scope.add::<TreeFn>("hello");
+            let tree = parse(src, ParseContext { scope: &scope }).unwrap();
+            let spans = tree.nodes.into_iter()
+                .map(|node| {
+                    let Span { start, end } = node.span;
+                    (start.line, start.column, end.line, end.column)
+                })
+                .collect::<Vec<_>>();
 
-        let parse = |string| {
-            parse(string, ParseContext { scope: &scope }).unwrap().nodes
-        };
+            assert_eq!(spans, correct);
+        }
 
-        let tree = parse("hello world");
-        assert_eq!(tree[0].span.pair(), (0, 5));
-        assert_eq!(tree[2].span.pair(), (6, 11));
-
-        let tree = parse("p1\n \np2");
-        assert_eq!(tree[1].span.pair(), (2, 5));
-
-        let tree = parse("p1\n p2");
-        assert_eq!(tree[1].span.pair(), (2, 4));
-
-        let src = "func [hello: pos, other][body _🌍_]";
-        let tree = parse(src);
-        assert_eq!(tree[0].span.pair(), (0, 4));
-        assert_eq!(tree[1].span.pair(), (4, 5));
-        assert_eq!(tree[2].span.pair(), (5, 37));
-
-        let func = if let Node::Func(f) = &tree[2].v { f } else { panic!() };
-        let body = &func.0.downcast::<TreeFn>().unwrap().tree.nodes;
-        assert_eq!(body[0].span.pair(), (0, 4));
-        assert_eq!(body[1].span.pair(), (4, 5));
-        assert_eq!(body[2].span.pair(), (5, 6));
-        assert_eq!(body[3].span.pair(), (6, 10));
-        assert_eq!(body[4].span.pair(), (10, 11));
+        test_span("hello world", vec![(1, 0, 1, 5), (1, 5, 1, 6), (1, 6, 1, 11)]);
+        test_span("p1\n \np2", vec![(1, 0, 1, 2), (1, 2, 2, 2), (3, 0, 3, 2)]);
+
+        let src = "func\n [hello: pos, other][body\r\n _🌍_\n]";
+        test_span(src, vec![
+            (1, 0, 1, 4),
+            (1, 4, 2, 1),
+            (2, 1, 4, 1)
+        ]);
     }
 
     /// Tests whether errors get reported correctly.
diff --git a/src/syntax/span.rs b/src/syntax/span.rs
index c12ac513..bc7001a9 100644
--- a/src/syntax/span.rs
+++ b/src/syntax/span.rs
@@ -35,28 +35,26 @@ debug_display!(Spanned; T where T: std::fmt::Debug);
 /// Describes a slice of source code.
 #[derive(Copy, Clone, Eq, PartialEq)]
 pub struct Span {
-    pub start: usize,
-    pub end: usize,
+    pub start: Position,
+    pub end: Position,
 }
 
 impl Span {
-    pub fn new(start: usize, end: usize) -> Span {
+    pub fn new(start: Position, end: Position) -> Span {
         Span { start, end }
     }
 
     pub fn merge(a: Span, b: Span) -> Span {
+        let start = a.start.min(b.start);
+
         Span {
             start: a.start.min(b.start),
             end: a.end.max(b.end),
         }
     }
 
-    pub fn at(index: usize) -> Span {
-        Span { start: index, end: index + 1 }
-    }
-
-    pub fn pair(&self) -> (usize, usize) {
-        (self.start, self.end)
+    pub fn at(pos: Position) -> Span {
+        Span { start: pos, end: pos }
     }
 
     pub fn expand(&mut self, other: Span) {
@@ -71,3 +69,26 @@ impl Display for Span {
 }
 
 debug_display!(Span);
+
+/// A line-column position in source code.
+#[derive(Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Hash)]
+pub struct Position {
+    /// The 1-indexed line (inclusive).
+    pub line: usize,
+    /// The 0-indexed column (inclusive).
+    pub column: usize,
+}
+
+impl Position {
+    pub fn new(line: usize, column: usize) -> Position {
+        Position { line, column }
+    }
+}
+
+impl Display for Position {
+    fn fmt(&self, f: &mut Formatter) -> fmt::Result {
+        write!(f, "{}:{}", self.line, self.column)
+    }
+}
+
+debug_display!(Position);
diff --git a/src/syntax/tokens.rs b/src/syntax/tokens.rs
index f5854d8f..cf37fe48 100644
--- a/src/syntax/tokens.rs
+++ b/src/syntax/tokens.rs
@@ -18,6 +18,8 @@ pub struct Tokens<'s> {
     chars: PeekableChars<'s>,
     state: TokensState,
     stack: SmallVec<[TokensState; 1]>,
+    line: usize,
+    line_start_index: usize,
 }
 
 /// The state the tokenizer is in.
@@ -40,11 +42,13 @@ impl<'s> Tokens<'s> {
             chars: PeekableChars::new(src),
             state: TokensState::Body,
             stack: SmallVec::new(),
+            line: 1,
+            line_start_index: 0,
         }
     }
 
     /// The index of the first character of the next token in the source string.
-    pub fn string_index(&mut self) -> usize {
+    pub fn string_index(&self) -> usize {
         self.chars.string_index()
     }
 
@@ -53,6 +57,11 @@ impl<'s> Tokens<'s> {
         self.chars.set_string_index(index);
     }
 
+    /// The current position in the source.
+    pub fn get_position(&self) -> Position {
+        self.line_position(self.string_index())
+    }
+
     /// Advance the iterator by one step.
     fn advance(&mut self) {
         self.chars.next();
@@ -68,6 +77,14 @@ impl<'s> Tokens<'s> {
     fn unswitch(&mut self) {
         self.state = self.stack.pop().unwrap_or(TokensState::Body);
     }
+
+    /// The `Position` with line and column for a string index.
+    fn line_position(&self, index: usize) -> Position {
+        Position {
+            line: self.line,
+            column: index - self.line_start_index,
+        }
+    }
 }
 
 impl<'s> Iterator for Tokens<'s> {
@@ -83,7 +100,8 @@ impl<'s> Iterator for Tokens<'s> {
             if let Some((index, '[')) = self.chars.peek() {
                 self.advance();
                 self.state = TS::Body;
-                return Some(Spanned::new(Token::LeftBracket, Span::at(index)));
+                let span = Span::at(self.line_position(index));
+                return Some(Spanned::new(Token::LeftBracket, span));
             } else {
                 self.unswitch();
             }
@@ -93,6 +111,9 @@ impl<'s> Iterator for Tokens<'s> {
         let (pos, next) = self.chars.next()?;
         let afterwards = self.chars.peekc();
 
+        /// The index at which the line ended, if it did.
+        let mut eol = None;
+
         let token = match next {
             // Functions
             '[' => {
@@ -173,9 +194,13 @@ impl<'s> Iterator for Tokens<'s> {
             // Newlines
             '\r' if afterwards == Some('\n') => {
                 self.advance();
+                eol = Some(pos + "\r\n".len());
                 Token::Newline
-            },
-            c if is_newline_char(c) => Token::Newline,
+            }
+            c if is_newline_char(c) => {
+                eol = Some(pos + c.len_utf8());
+                Token::Newline
+            }
 
             // Star/Underscore/Backtick in bodies
             '*' if self.state == TS::Body => Token::Star,
@@ -257,12 +282,21 @@ impl<'s> Iterator for Tokens<'s> {
             }
         };
 
-        Some(Spanned::new(token, Span::new(pos, self.string_index())))
+        let start = self.line_position(pos);
+        let end = self.get_position();
+        let span = Span::new(start, end);
+
+        if let Some(index) = eol {
+            self.line += 1;
+            self.line_start_index = index;
+        }
+
+        Some(Spanned::new(token, span))
     }
 }
 
 /// Whether this character is a newline (or starts one).
-fn is_newline_char(character: char) -> bool {
+pub(crate) fn is_newline_char(character: char) -> bool {
     match character {
         '\n' | '\r' | '\u{000c}' | '\u{0085}' | '\u{2028}' | '\u{2029}' => true,
         _ => false,
@@ -316,7 +350,7 @@ impl<'s> PeekableChars<'s> {
         self.chars.next().map(|(i, c)| (self.base + i, c))
     }
 
-    fn string_index(&mut self) -> usize {
+    fn string_index(&self) -> usize {
         self.index
     }
 
@@ -363,9 +397,12 @@ mod tests {
     }
 
     /// Test if the tokens of the source code have the correct spans.
-    fn test_span(src: &str, spans: Vec<(usize, usize)>) {
+    fn test_span(src: &str, spans: Vec<(usize, usize, usize, usize)>) {
         assert_eq!(Tokens::new(src)
-            .map(|token| token.span.pair())
+            .map(|token| {
+                let Span { start, end } = token.span;
+                (start.line, start.column, end.line, end.column)
+            })
             .collect::<Vec<_>>(), spans);
     }
 
@@ -496,8 +533,12 @@ mod tests {
     #[test]
     #[rustfmt::skip]
     fn tokenize_spans() {
-        test_span("Hello World", vec![(0, 5), (5, 6), (6, 11)]);
-        test_span("🌍_🎈", vec![(0, 4), (4, 5), (5, 9)]);
-        test_span("[hello: world]", vec![(0, 1), (1, 6), (6, 7), (7, 8), (8, 13), (13, 14)]);
+        test_span("Hello World", vec![(1, 0, 1, 5), (1, 5, 1, 6), (1, 6, 1, 11)]);
+        test_span("🌍_🎈", vec![(1, 0, 1, 4), (1, 4, 1, 5), (1, 5, 1, 9)]);
+        test_span("hello\nworld", vec![(1, 0, 1, 5), (1, 5, 1, 6), (2, 0, 2, 5)]);
+        test_span("[hello: world]", vec![
+            (1, 0, 1, 1), (1, 1, 1, 6), (1, 6, 1, 7),
+            (1, 7, 1, 8), (1, 8, 1, 13), (1, 13, 1, 14)
+        ]);
     }
 }
author	Laurenz <laurmaedje@gmail.com>	2020-01-10 13:03:43 +0100
committer	Laurenz <laurmaedje@gmail.com>	2020-01-10 13:03:43 +0100
commit	a75ddd2c9356da85b155f5c52fd064c15e6f81b3 (patch)
tree	40d63a65d84945bd2cbf33449096e14e84babdf9 /src/syntax
parent	5dbc7dc5aaaea794b140c5ea7839d681110d7b79 (diff)