diff options
Diffstat (limited to 'src/parse')
| -rw-r--r-- | src/parse/lines.rs | 145 | ||||
| -rw-r--r-- | src/parse/mod.rs | 8 | ||||
| -rw-r--r-- | src/parse/parser.rs | 20 | ||||
| -rw-r--r-- | src/parse/resolve.rs | 17 | ||||
| -rw-r--r-- | src/parse/scanner.rs | 23 | ||||
| -rw-r--r-- | src/parse/tokens.rs | 9 |
6 files changed, 37 insertions, 185 deletions
diff --git a/src/parse/lines.rs b/src/parse/lines.rs deleted file mode 100644 index 2d97a25c..00000000 --- a/src/parse/lines.rs +++ /dev/null @@ -1,145 +0,0 @@ -// FIXME: -// Both `LineMap::location` and `search_column` can lead to quadratic compile -// times for very long lines. We probably need some smart acceleration structure -// to determine columns. - -use super::Scanner; -use crate::syntax::{Location, Pos}; - -/// Enables conversion of byte position to locations. -pub struct LineMap<'s> { - src: &'s str, - line_starts: Vec<Pos>, -} - -impl<'s> LineMap<'s> { - /// Create a new line map for a source string. - pub fn new(src: &'s str) -> Self { - let mut line_starts = vec![Pos::ZERO]; - let mut s = Scanner::new(src); - - while let Some(c) = s.eat_merging_crlf() { - if is_newline(c) { - line_starts.push(s.index().into()); - } - } - - Self { src, line_starts } - } - - /// Convert a byte position to a location. - pub fn location(&self, pos: Pos) -> Option<Location> { - // Find the line which contains the position. - let line_index = match self.line_starts.binary_search(&pos) { - Ok(i) => i, - Err(i) => i - 1, - }; - - let start = self.line_starts.get(line_index)?; - let head = self.src.get(start.to_usize() .. pos.to_usize())?; - - // TODO: What about tabs? - let column_index = head.chars().count(); - - Some(Location { - line: 1 + line_index as u32, - column: 1 + column_index as u32, - }) - } - - /// Convert a location to a byte position. - pub fn pos(&self, location: Location) -> Option<Pos> { - // Determine the boundaries of the line. - let line_idx = location.line.checked_sub(1)? as usize; - let line_start = *self.line_starts.get(line_idx)?; - let line_end = self - .line_starts - .get(location.line as usize) - .map_or(self.src.len(), |pos| pos.to_usize()); - - let line = self.src.get(line_start.to_usize() .. line_end)?; - - // Find the index in the line. For the first column, the index is always - // zero. For other columns, we have to look at which byte the char - // directly before the column in question ends. We can't do - // `nth(column_idx)` directly since the column may be behind the last - // char. - let column_idx = location.column.checked_sub(1)? as usize; - let line_offset = if let Some(prev_idx) = column_idx.checked_sub(1) { - // TODO: What about tabs? - let (idx, prev) = line.char_indices().nth(prev_idx)?; - idx + prev.len_utf8() - } else { - 0 - }; - - Some(line_start + line_offset) - } -} - -/// Count how many column the string would fill. -pub fn count_columns(src: &str) -> usize { - let mut column = 0; - for c in src.chars().rev() { - if is_newline(c) { - break; - } else if c == '\t' { - // TODO: How many columns per tab? - column += 2; - } else { - column += 1; - } - } - column -} - -/// Whether this character denotes a newline. -#[inline] -pub fn is_newline(character: char) -> bool { - matches!( - character, - // Line Feed, Vertical Tab, Form Feed, Carriage Return. - '\n' | '\x0B' | '\x0C' | '\r' | - // Next Line, Line Separator, Paragraph Separator. - '\u{0085}' | '\u{2028}' | '\u{2029}' - ) -} - -#[cfg(test)] -mod tests { - use super::*; - - const TEST: &str = "äbcde\nf💛g\r\nhi\rjkl"; - - #[test] - fn test_line_map_new() { - let map = LineMap::new(TEST); - assert_eq!(map.line_starts, vec![Pos(0), Pos(7), Pos(15), Pos(18)]); - } - - #[test] - fn test_line_map_location() { - let map = LineMap::new(TEST); - assert_eq!(map.location(Pos(0)), Some(Location::new(1, 1))); - assert_eq!(map.location(Pos(2)), Some(Location::new(1, 2))); - assert_eq!(map.location(Pos(6)), Some(Location::new(1, 6))); - assert_eq!(map.location(Pos(7)), Some(Location::new(2, 1))); - assert_eq!(map.location(Pos(8)), Some(Location::new(2, 2))); - assert_eq!(map.location(Pos(12)), Some(Location::new(2, 3))); - assert_eq!(map.location(Pos(21)), Some(Location::new(4, 4))); - assert_eq!(map.location(Pos(22)), None); - } - - #[test] - fn test_line_map_pos() { - fn assert_round_trip(map: &LineMap, pos: Pos) { - assert_eq!(map.location(pos).and_then(|loc| map.pos(loc)), Some(pos)); - } - - let map = LineMap::new(TEST); - assert_round_trip(&map, Pos(0)); - assert_round_trip(&map, Pos(7)); - assert_round_trip(&map, Pos(12)); - assert_round_trip(&map, Pos(21)); - } -} diff --git a/src/parse/mod.rs b/src/parse/mod.rs index c103c342..f033e01f 100644 --- a/src/parse/mod.rs +++ b/src/parse/mod.rs @@ -1,12 +1,10 @@ //! Parsing and tokenization. -mod lines; mod parser; mod resolve; mod scanner; mod tokens; -pub use lines::*; pub use parser::*; pub use resolve::*; pub use scanner::*; @@ -15,13 +13,13 @@ pub use tokens::*; use std::rc::Rc; use crate::diag::TypResult; -use crate::loading::FileId; +use crate::source::SourceFile; use crate::syntax::*; use crate::util::EcoString; /// Parse a string of source code. -pub fn parse(file: FileId, src: &str) -> TypResult<SyntaxTree> { - let mut p = Parser::new(file, src); +pub fn parse(source: &SourceFile) -> TypResult<SyntaxTree> { + let mut p = Parser::new(source); let tree = tree(&mut p); let errors = p.finish(); if errors.is_empty() { diff --git a/src/parse/parser.rs b/src/parse/parser.rs index 0238c8be..6b478780 100644 --- a/src/parse/parser.rs +++ b/src/parse/parser.rs @@ -1,15 +1,15 @@ use std::fmt::{self, Debug, Formatter}; use std::ops::Range; -use super::{count_columns, TokenMode, Tokens}; +use super::{TokenMode, Tokens}; use crate::diag::Error; -use crate::loading::FileId; +use crate::source::SourceFile; use crate::syntax::{Pos, Span, Token}; /// A convenient token-based parser. pub struct Parser<'s> { /// The id of the parsed file. - file: FileId, + source: &'s SourceFile, /// Parsing errors. errors: Vec<Error>, /// An iterator over the source tokens. @@ -60,11 +60,11 @@ pub enum Group { impl<'s> Parser<'s> { /// Create a new parser for the source string. - pub fn new(file: FileId, src: &'s str) -> Self { - let mut tokens = Tokens::new(src, TokenMode::Markup); + pub fn new(source: &'s SourceFile) -> Self { + let mut tokens = Tokens::new(source.src(), TokenMode::Markup); let next = tokens.next(); Self { - file, + source, errors: vec![], tokens, groups: vec![], @@ -82,11 +82,7 @@ impl<'s> Parser<'s> { /// Add an error with location and message. pub fn error(&mut self, span: impl Into<Span>, message: impl Into<String>) { - self.errors.push(Error { - file: self.file, - span: span.into(), - message: message.into(), - }); + self.errors.push(Error::new(self.source.file(), span, message)); } /// Eat the next token and add an error that it is not the expected `thing`. @@ -324,7 +320,7 @@ impl<'s> Parser<'s> { /// Determine the column for the given index in the source. pub fn column(&self, index: usize) -> usize { - count_columns(self.tokens.scanner().get(.. index)) + self.source.pos_to_column(index.into()).unwrap() } /// The span from `start` to [`self.prev_end()`](Self::prev_end). diff --git a/src/parse/resolve.rs b/src/parse/resolve.rs index f97d5383..7bd160f9 100644 --- a/src/parse/resolve.rs +++ b/src/parse/resolve.rs @@ -109,8 +109,11 @@ fn split_lines(text: &str) -> Vec<String> { let mut line = String::new(); let mut lines = Vec::new(); - while let Some(c) = s.eat_merging_crlf() { + while let Some(c) = s.eat() { if is_newline(c) { + if c == '\r' { + s.eat_if('\n'); + } lines.push(std::mem::take(&mut line)); } else { line.push(c); @@ -173,14 +176,10 @@ mod tests { text: &str, block: bool, ) { - Span::without_cmp(|| { - assert_eq!(resolve_raw(Span::ZERO, raw, backticks), RawNode { - span: Span::ZERO, - lang: lang.and_then(|id| Ident::new(id, 0)), - text: text.into(), - block, - }); - }); + let node = resolve_raw(Span::ZERO, raw, backticks); + assert_eq!(node.lang.as_deref(), lang); + assert_eq!(node.text, text); + assert_eq!(node.block, block); } // Just one backtick. diff --git a/src/parse/scanner.rs b/src/parse/scanner.rs index 9ee7641c..bb827255 100644 --- a/src/parse/scanner.rs +++ b/src/parse/scanner.rs @@ -47,17 +47,6 @@ impl<'s> Scanner<'s> { debug_assert_eq!(next, Some(c)); } - /// Consume the next char, coalescing `\r\n` to just `\n`. - #[inline] - pub fn eat_merging_crlf(&mut self) -> Option<char> { - if self.rest().starts_with("\r\n") { - self.index += 2; - Some('\n') - } else { - self.eat() - } - } - /// Eat chars while the condition is true. #[inline] pub fn eat_while<F>(&mut self, mut f: F) -> &'s str @@ -168,3 +157,15 @@ impl Debug for Scanner<'_> { write!(f, "Scanner({}|{})", self.eaten(), self.rest()) } } + +/// Whether this character denotes a newline. +#[inline] +pub fn is_newline(character: char) -> bool { + matches!( + character, + // Line Feed, Vertical Tab, Form Feed, Carriage Return. + '\n' | '\x0B' | '\x0C' | '\r' | + // Next Line, Line Separator, Paragraph Separator. + '\u{0085}' | '\u{2028}' | '\u{2029}' + ) +} diff --git a/src/parse/tokens.rs b/src/parse/tokens.rs index 356a2f96..9fd13ecc 100644 --- a/src/parse/tokens.rs +++ b/src/parse/tokens.rs @@ -198,13 +198,16 @@ impl<'s> Tokens<'s> { // Count the number of newlines. let mut newlines = 0; - while let Some(c) = self.s.eat_merging_crlf() { + while let Some(c) = self.s.eat() { if !c.is_whitespace() { self.s.uneat(); break; } if is_newline(c) { + if c == '\r' { + self.s.eat_if('\n'); + } newlines += 1; } } @@ -484,8 +487,8 @@ impl Debug for Tokens<'_> { } } -fn keyword(id: &str) -> Option<Token<'static>> { - Some(match id { +fn keyword(ident: &str) -> Option<Token<'static>> { + Some(match ident { "not" => Token::Not, "and" => Token::And, "or" => Token::Or, |
