diff options
| author | Laurenz <laurmaedje@gmail.com> | 2020-09-30 18:59:33 +0200 |
|---|---|---|
| committer | Laurenz <laurmaedje@gmail.com> | 2020-09-30 18:59:33 +0200 |
| commit | 4077a7c11ea19b1b6b6b6fe3014b9018846cf21b (patch) | |
| tree | 70e4c891c2c660b4136890cebbae7c375fe36c05 /src/parse/tokenizer.rs | |
| parent | 7cc279f7ae122f4c40592004dde89792c636b3c8 (diff) | |
Refactor raw blocks 💱
Diffstat (limited to 'src/parse/tokenizer.rs')
| -rw-r--r-- | src/parse/tokenizer.rs | 121 |
1 files changed, 44 insertions, 77 deletions
diff --git a/src/parse/tokenizer.rs b/src/parse/tokenizer.rs index 92d15edc..720bec43 100644 --- a/src/parse/tokenizer.rs +++ b/src/parse/tokenizer.rs @@ -56,7 +56,7 @@ impl<'s> Tokens<'s> { /// The position in the string at which the last token ends and next token /// will start. pub fn pos(&self) -> Pos { - Pos(self.index as u32) + self.index.into() } } @@ -111,7 +111,7 @@ impl<'s> Iterator for Tokens<'s> { // Style toggles. '_' if self.mode == Body => Underscore, - '`' if self.mode == Body => self.read_raw_or_code(), + '`' if self.mode == Body => self.read_raw(), // Sections. '#' if self.mode == Body => Hashtag, @@ -230,66 +230,31 @@ impl<'s> Tokens<'s> { Str { string, terminated } } - fn read_raw_or_code(&mut self) -> Token<'s> { - let (raw, terminated) = self.read_until_unescaped('`'); - if raw.is_empty() && terminated && self.peek() == Some('`') { - // Third tick found; this is a code block. + fn read_raw(&mut self) -> Token<'s> { + let mut backticks = 1; + while self.peek() == Some('`') { self.eat(); + backticks += 1; + } - // Reads the lang tag (until newline or whitespace). - let start = self.pos(); - let (lang, _) = self.read_string_until(false, 0, 0, |c| { - c == '`' || c.is_whitespace() || is_newline_char(c) - }); - let end = self.pos(); - - let lang = if !lang.is_empty() { - Some(lang.span_with(Span::new(start, end))) - } else { - None - }; - - // Skip to start of raw contents. - while let Some(c) = self.peek() { - if is_newline_char(c) { - self.eat(); - if c == '\r' && self.peek() == Some('\n') { - self.eat(); - } - - break; - } else if c.is_whitespace() { - self.eat(); - } else { - break; - } - } - - let start = self.index; - let mut backticks = 0u32; + let start = self.index; - while backticks < 3 { - match self.eat() { - Some('`') => backticks += 1, - // Escaping of triple backticks. - Some('\\') if backticks == 1 && self.peek() == Some('`') => { - backticks = 0; - } - Some(_) => {} - None => break, - } + let mut found = 0; + while found < backticks { + match self.eat() { + Some('`') => found += 1, + Some(_) => found = 0, + None => break, } + } - let terminated = backticks == 3; - let end = self.index - if terminated { 3 } else { 0 }; + let terminated = found == backticks; + let end = self.index - if terminated { found } else { 0 }; - Code { - lang, - raw: &self.src[start .. end], - terminated, - } - } else { - Raw { raw, terminated } + Raw { + raw: &self.src[start .. end], + backticks, + terminated, } } @@ -469,18 +434,8 @@ mod tests { fn Str(string: &str, terminated: bool) -> Token { Token::Str { string, terminated } } - fn Raw(raw: &str, terminated: bool) -> Token { - Token::Raw { raw, terminated } - } - fn Code<'a>( - lang: Option<Spanned<&'a str>>, - raw: &'a str, - terminated: bool, - ) -> Token<'a> { - Token::Code { lang, raw, terminated } - } - fn Lang<'a, T: Into<Spanned<&'a str>>>(lang: T) -> Option<Spanned<&'a str>> { - Some(Into::<Spanned<&str>>::into(lang)) + fn Raw(raw: &str, backticks: usize, terminated: bool) -> Token { + Token::Raw { raw, backticks, terminated } } fn UE(sequence: &str, terminated: bool) -> Token { Token::UnicodeEscape { sequence, terminated } @@ -535,21 +490,33 @@ mod tests { t!(Body, "***" => Star, Star, Star); t!(Body, "[func]*bold*" => L, T("func"), R, Star, T("bold"), Star); t!(Body, "hi_you_ there" => T("hi"), Underscore, T("you"), Underscore, S(0), T("there")); - t!(Body, "`raw`" => Raw("raw", true)); t!(Body, "# hi" => Hashtag, S(0), T("hi")); t!(Body, "#()" => Hashtag, T("()")); - t!(Body, "`[func]`" => Raw("[func]", true)); - t!(Body, "`]" => Raw("]", false)); - t!(Body, "\\ " => Backslash, S(0)); - t!(Body, "`\\``" => Raw("\\`", true)); - t!(Body, "``not code`" => Raw("", true), T("not"), S(0), T("code"), Raw("", false)); - t!(Body, "```rust hi```" => Code(Lang("rust"), "hi", true)); - t!(Body, "``` hi`\\``" => Code(None, "hi`\\``", false)); - t!(Body, "```js \r\n document.write(\"go\")" => Code(Lang("js"), " document.write(\"go\")", false)); t!(Header, "_`" => Invalid("_`")); } #[test] + fn test_tokenize_raw() { + // Basics. + t!(Body, "`raw`" => Raw("raw", 1, true)); + t!(Body, "`[func]`" => Raw("[func]", 1, true)); + t!(Body, "`]" => Raw("]", 1, false)); + t!(Body, r"`\`` " => Raw(r"\", 1, true), Raw(" ", 1, false)); + + // Language tag. + t!(Body, "``` hi```" => Raw(" hi", 3, true)); + t!(Body, "```rust hi```" => Raw("rust hi", 3, true)); + t!(Body, r"``` hi\````" => Raw(r" hi\", 3, true), Raw("", 1, false)); + t!(Body, "``` not `y`e`t finished```" => Raw(" not `y`e`t finished", 3, true)); + t!(Body, "```js \r\n document.write(\"go\")`" + => Raw("js \r\n document.write(\"go\")`", 3, false)); + + // More backticks. + t!(Body, "`````` ``````hi" => Raw(" ", 6, true), T("hi")); + t!(Body, "````\n```js\nalert()\n```\n````" => Raw("\n```js\nalert()\n```\n", 4, true)); + } + + #[test] fn tokenize_header_only_tokens() { t!(Body, "a: b" => T("a:"), S(0), T("b")); t!(Body, "c=d, " => T("c=d,"), S(0)); |
