summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Dan Allen <dan.j.allen@gmail.com> 2014-05-16 03:32:05 -0600
committer Dan Allen <dan.j.allen@gmail.com> 2014-05-16 03:32:05 -0600
commit cddfc7b0ae26db4b355bae276247fa04766fdc5f (patch)
tree 395466ec8d9d4fd1d4087c60841441a6999e84d3
parent 686e4237a14f1923818ac956a95b4396bb47156b (diff)
parent a1b17518d6f699ed6cb55f244dc7385a1a4bb845 (diff)
Merge pull request #951 from mojavelinux/issue-892
resolves #892 - match word characters defined by Unicode
-rw-r--r-- README.adoc 8
-rw-r--r-- lib/asciidoctor.rb 160
-rw-r--r-- lib/asciidoctor/substitutors.rb 17
-rw-r--r-- test/parser_test.rb 67
-rw-r--r-- test/sections_test.rb 37
-rw-r--r-- test/substitutions_test.rb 27
6 files changed, 212 insertions, 104 deletions
diff --git a/README.adoc b/README.adoc
index d61be259..388cf0ee 100644
--- a/README.adoc
+++ b/README.adoc
@@ -567,9 +567,9 @@ To run all the tests, simply execute +rake+:
$ rake
-If you want to run a single test file, you can use +testrb+:
+If you want to run a single test file, you can use +ruby+:
- $ testrb test/blocks_test.rb
+ $ ruby test/blocks_test.rb
To test a single test case, first add the string "wip" to the beginning of the description.
For example:
@@ -579,9 +579,9 @@ test 'wip should render ...' do
...
end
-Then, run +testrb+ again, but this time pass a selector argument so it finds matching tests:
+Then, run +ruby+ again, but this time pass a selector argument so it finds matching tests:
- $ testrb test/blocks_test.rb -n /wip/
+ $ ruby test/blocks_test.rb -n /wip/
Once you are done with your test, make sure to remove "wip" from the description and run all the tests again using +rake+.
diff --git a/lib/asciidoctor.rb b/lib/asciidoctor.rb
index 633c3090..1ea10e0d 100644
--- a/lib/asciidoctor.rb
+++ b/lib/asciidoctor.rb
@@ -2,6 +2,7 @@ RUBY_ENGINE = 'unknown' unless defined? RUBY_ENGINE
RUBY_ENGINE_OPAL = (RUBY_ENGINE == 'opal')
RUBY_ENGINE_JRUBY = (RUBY_ENGINE == 'jruby')
RUBY_MIN_VERSION_1_9 = (RUBY_VERSION >= '1.9')
+RUBY_MIN_VERSION_2 = (RUBY_VERSION >= '2')
require 'set'
@@ -51,6 +52,9 @@ $:.unshift File.dirname __FILE__
#
module Asciidoctor
+ # alias the RUBY_ENGINE constant inside the Asciidoctor namespace
+ RUBY_ENGINE = ::RUBY_ENGINE
+
module SafeMode
# A safe mode level that disables any of the security features enforced
@@ -334,22 +338,46 @@ module Asciidoctor
#(pseudo)module Rx
## Regular expression character classes (to ensure regexp compatibility between Ruby and JavaScript)
+ ## CC stands for "character class", CG stands for "character class group"
+
+ # NOTE \w matches only the ASCII word characters, whereas [[:word:]] or \p{Word} matches any character in the Unicode word category.
- # character classes for JavaScript Regexp engine
- # NOTE use of double quotes are intentional to work around Opal issue
+ # character classes for the Regexp engine(s) in JavaScript
if RUBY_ENGINE == 'opal'
CC_ALPHA = 'a-zA-Z'
+ CG_ALPHA = '[a-zA-Z]'
CC_ALNUM = 'a-zA-Z0-9'
- CC_BLANK = "[ \\t]"
- CC_GRAPH = '[\x21-\x7E]' # non-blank character; broken in Opal!
- CC_EOL = "(?=\\n|$)"
- # character classes for Ruby Regexp engine
+ CG_ALNUM = '[a-zA-Z0-9]'
+ CG_BLANK = '[ \\t]'
+ CC_EOL = '(?=\\n|$)'
+ CG_GRAPH = '[\\x21-\\x7E]' # non-blank character
+ CC_WORD = 'a-zA-Z0-9_'
+ CG_WORD = '[a-zA-Z0-9_]'
+ # character classes for the Regexp engine in Ruby >= 2 (Ruby 1.9 supports \p{} but has problems w/ encoding)
+ elsif ::RUBY_MIN_VERSION_2
+ CC_ALPHA = CG_ALPHA = '\p{Alpha}'
+ CC_ALNUM = CG_ALNUM = '\p{Alnum}'
+ CG_BLANK = '\p{Blank}'
+ CC_EOL = '$'
+ CG_GRAPH = '\p{Graph}'
+ CC_WORD = CG_WORD = '\p{Word}'
+ # character classes for the Regexp engine in Ruby < 2
else
CC_ALPHA = '[:alpha:]'
+ CG_ALPHA = '[[:alpha:]]'
CC_ALNUM = '[:alnum:]'
- CC_BLANK = '[[:blank:]]'
- CC_GRAPH = '[[:graph:]]' # non-blank character
+ CG_ALNUM = '[[:alnum:]]'
+ CG_BLANK = '[[:blank:]]'
CC_EOL = '$'
+ CG_GRAPH = '[[:graph:]]' # non-blank character
+ if ::RUBY_MIN_VERSION_1_9
+ CC_WORD = '[:word:]'
+ CG_WORD = '[[:word:]]'
+ else
+ # NOTE Ruby 1.8 cannot match word characters beyond the ASCII range; if you need this feature, upgrade!
+ CC_WORD = '[:alnum:]_'
+ CG_WORD = '[[:alnum:]_]'
+ end
end
## Document header
@@ -359,8 +387,9 @@ module Asciidoctor
# Examples
#
# Doc Writer <doc@example.com>
+ # Mary_Sue Brontë
#
- AuthorInfoLineRx = /^(\w[\w\-'.]*)(?: +(\w[\w\-'.]*))?(?: +(\w[\w\-'.]*))?(?: +<([^>]+)>)?$/
+ AuthorInfoLineRx = /^(#{CG_WORD}[#{CC_WORD}\-'.]*)(?: +(#{CG_WORD}[#{CC_WORD}\-'.]*))?(?: +(#{CG_WORD}[#{CC_WORD}\-'.]*))?(?: +<([^>]+)>)?$/
# Matches the revision info line, which appears immediately following
# the author info line beneath the document title.
@@ -385,7 +414,7 @@ module Asciidoctor
#
# asciidoctor - converts AsciiDoc source files to HTML, DocBook and other formats
#
- ManpageNamePurposeRx = /^(.*?)#{CC_BLANK}+-#{CC_BLANK}+(.*)$/
+ ManpageNamePurposeRx = /^(.*?)#{CG_BLANK}+-#{CG_BLANK}+(.*)$/
## Preprocessor directives
@@ -409,9 +438,7 @@ module Asciidoctor
#
# "{asciidoctor-version}" >= "0.1.0"
#
- EvalExpressionRx = /^(\S.*?)#{CC_BLANK}*(==|!=|<=|>=|<|>)#{CC_BLANK}*(\S.*)$/
- # ...or if we want to be more strict up front about what's on each side
- # EvalExpressionRx = /^(true|false|("|'|)\{\w+(?:\-\w+)*\}\2|("|')[^\3]*\3|\-?\d+(?:\.\d+)*)#{CC_BLANK}*(==|!=|<=|>=|<|>)#{CC_BLANK}*(true|false|("|'|)\{\w+(?:\-\w+)*\}\6|("|')[^\7]*\7|\-?\d+(?:\.\d+)*)$/
+ EvalExpressionRx = /^(\S.*?)#{CG_BLANK}*(==|!=|<=|>=|<|>)#{CG_BLANK}*(\S.*)$/
# Matches an include preprocessor directive.
#
@@ -437,7 +464,7 @@ module Asciidoctor
# collapsing the line breaks and indentation to
# a single space.
#
- AttributeEntryRx = /^:(!?\w.*?):(?:#{CC_BLANK}+(.*))?$/
+ AttributeEntryRx = /^:(!?\w.*?):(?:#{CG_BLANK}+(.*))?$/
# Matches invalid characters in an attribute name.
InvalidAttributeNameCharsRx = /[^\w\-]/
@@ -470,7 +497,7 @@ module Asciidoctor
# [[idname]]
# [[idname,Reference Text]]
#
- BlockAnchorRx = /^\[\[(?:|([#{CC_ALPHA}:_][\w:.-]*)(?:,#{CC_BLANK}*(\S.*))?)\]\]$/
+ BlockAnchorRx = /^\[\[(?:|([#{CC_ALPHA}:_][#{CC_WORD}:.-]*)(?:,#{CG_BLANK}*(\S.*))?)\]\]$/
# Matches an attribute list above a block element.
#
@@ -485,12 +512,12 @@ module Asciidoctor
# # as attribute reference
# [{lead}]
#
- BlockAttributeListRx = /^\[(|#{CC_BLANK}*[\w\{,.#"'%].*)\]$/
+ BlockAttributeListRx = /^\[(|#{CG_BLANK}*[#{CC_WORD}\{,.#"'%].*)\]$/
# A combined pattern that matches either a block anchor or a block attribute list.
#
# TODO this one gets hit a lot, should be optimized as much as possible
- BlockAttributeLineRx = /^\[(|#{CC_BLANK}*[\w\{,.#"'%].*|\[(?:|[#{CC_ALPHA}:_][\w:.-]*(?:,#{CC_BLANK}*\S.*)?)\])\]$/
+ BlockAttributeLineRx = /^\[(|#{CG_BLANK}*[#{CC_WORD}\{,.#"'%].*|\[(?:|[#{CC_ALPHA}:_][#{CC_WORD}:.-]*(?:,#{CG_BLANK}*\S.*)?)\])\]$/
# Matches a title above a block.
#
@@ -507,7 +534,7 @@ module Asciidoctor
# NOTE: Just a little note.
# TIP: Don't forget!
#
- AdmonitionParagraphRx = /^(#{ADMONITION_STYLES.to_a * '|'}):#{CC_BLANK}/
+ AdmonitionParagraphRx = /^(#{ADMONITION_STYLES.to_a * '|'}):#{CG_BLANK}/
# Matches a literal paragraph, which is a line of text preceded by at least one space.
#
@@ -515,7 +542,7 @@ module Asciidoctor
#
# <SPACE>Foo
# <TAB>Foo
- LiteralParagraphRx = /^(#{CC_BLANK}+.*)$/
+ LiteralParagraphRx = /^(#{CG_BLANK}+.*)$/
# Matches a comment block.
#
@@ -550,11 +577,11 @@ module Asciidoctor
# match[1] is the delimiter, whose length determines the level
# match[2] is the title itself
# match[3] is an inline anchor, which becomes the section id
- AtxSectionRx = /^((?:=|#){1,6})#{CC_BLANK}+(\S.*?)(?:#{CC_BLANK}+\1)?$/
+ AtxSectionRx = /^((?:=|#){1,6})#{CG_BLANK}+(\S.*?)(?:#{CG_BLANK}+\1)?$/
# Matches the restricted section name for a two-line (Setext-style) section title.
# The name cannot begin with a dot and has at least one alphanumeric character.
- SetextSectionTitleRx = /^((?=.*\w+.*)[^.].*?)$/
+ SetextSectionTitleRx = /^((?=.*#{CG_WORD}+.*)[^.].*?)$/
# Matches the underline in a two-line (Setext-style) section title.
#
@@ -571,10 +598,10 @@ module Asciidoctor
# Section Title [[idname]]
# Section Title [[idname,Reference Text]]
#
- InlineSectionAnchorRx = /^(.*?)#{CC_BLANK}+(\\)?\[\[([#{CC_ALPHA}:_][\w:.-]*)(?:,#{CC_BLANK}*(\S.*?))?\]\]$/
+ InlineSectionAnchorRx = /^(.*?)#{CG_BLANK}+(\\)?\[\[([#{CC_ALPHA}:_][#{CC_WORD}:.-]*)(?:,#{CG_BLANK}*(\S.*?))?\]\]$/
# Matches invalid characters in a section id.
- InvalidSectionIdCharsRx = /&(?:[a-zA-Z]{2,}|#\d{2,5}|#x[a-fA-F0-9]{2,4});|\W+?/
+ InvalidSectionIdCharsRx = /&(?:[a-zA-Z]{2,}|#\d{2,5}|#x[a-fA-F0-9]{2,4});|[^#{CC_WORD}]+?/
# Matches the block style used to designate a section title as a floating title.
#
@@ -588,7 +615,7 @@ module Asciidoctor
## Lists
# Detects the start of any list item.
- AnyListRx = /^(?:<?\d+>#{CC_BLANK}+#{CC_GRAPH}|#{CC_BLANK}*(?:-|(?:\*|\.){1,5}|\d+\.|[a-zA-Z]\.|[IVXivx]+\))#{CC_BLANK}+#{CC_GRAPH}|#{CC_BLANK}*.*?(?::{2,4}|;;)(?:#{CC_BLANK}+#{CC_GRAPH}|$))/
+ AnyListRx = /^(?:<?\d+>#{CG_BLANK}+#{CG_GRAPH}|#{CG_BLANK}*(?:-|(?:\*|\.){1,5}|\d+\.|[a-zA-Z]\.|[IVXivx]+\))#{CG_BLANK}+#{CG_GRAPH}|#{CG_BLANK}*.*?(?::{2,4}|;;)(?:#{CG_BLANK}+#{CG_GRAPH}|$))/
# Matches an unordered list item (one level for hyphens, up to 5 levels for asterisks).
#
@@ -597,7 +624,7 @@ module Asciidoctor
# * Foo
# - Foo
#
- UnorderedListRx = /^#{CC_BLANK}*(-|\*{1,5})#{CC_BLANK}+(.*)$/
+ UnorderedListRx = /^#{CG_BLANK}*(-|\*{1,5})#{CG_BLANK}+(.*)$/
# Matches an ordered list item (explicit numbering or up to 5 consecutive dots).
#
@@ -612,7 +639,7 @@ module Asciidoctor
# I. Foo (upperroman)
#
# NOTE leading space match is not always necessary, but is used for list reader
- OrderedListRx = /^#{CC_BLANK}*(\.{1,5}|\d+\.|[a-zA-Z]\.|[IVXivx]+\))#{CC_BLANK}+(.*)$/
+ OrderedListRx = /^#{CG_BLANK}*(\.{1,5}|\d+\.|[a-zA-Z]\.|[IVXivx]+\))#{CG_BLANK}+(.*)$/
# Matches the ordinals for each type of ordered list.
OrderedListMarkerRxMap = {
@@ -649,15 +676,15 @@ module Asciidoctor
# NOTE negative match for comment line is intentional since that isn't handled when looking for next list item
# QUESTION should we check for line comment in regex or when scanning the lines?
#
- DefinitionListRx = /^(?!\/\/)#{CC_BLANK}*(.*?)(:{2,4}|;;)(?:#{CC_BLANK}+(.*))?$/
+ DefinitionListRx = /^(?!\/\/)#{CG_BLANK}*(.*?)(:{2,4}|;;)(?:#{CG_BLANK}+(.*))?$/
# Matches a sibling definition list item (which does not include the keyed type).
DefinitionListSiblingRx = {
# (?:.*?[^:])? - a non-capturing group which grabs longest sequence of characters that doesn't end w/ colon
- '::' => /^(?!\/\/)#{CC_BLANK}*((?:.*[^:])?)(::)(?:#{CC_BLANK}+(.*))?$/,
- ':::' => /^(?!\/\/)#{CC_BLANK}*((?:.*[^:])?)(:::)(?:#{CC_BLANK}+(.*))?$/,
- '::::' => /^(?!\/\/)#{CC_BLANK}*((?:.*[^:])?)(::::)(?:#{CC_BLANK}+(.*))?$/,
- ';;' => /^(?!\/\/)#{CC_BLANK}*(.*)(;;)(?:#{CC_BLANK}+(.*))?$/
+ '::' => /^(?!\/\/)#{CG_BLANK}*((?:.*[^:])?)(::)(?:#{CG_BLANK}+(.*))?$/,
+ ':::' => /^(?!\/\/)#{CG_BLANK}*((?:.*[^:])?)(:::)(?:#{CG_BLANK}+(.*))?$/,
+ '::::' => /^(?!\/\/)#{CG_BLANK}*((?:.*[^:])?)(::::)(?:#{CG_BLANK}+(.*))?$/,
+ ';;' => /^(?!\/\/)#{CG_BLANK}*(.*)(;;)(?:#{CG_BLANK}+(.*))?$/
}
# Matches a callout list item.
@@ -666,7 +693,7 @@ module Asciidoctor
#
# <1> Foo
#
- CalloutListRx = /^<?(\d+)>#{CC_BLANK}+(.*)/
+ CalloutListRx = /^<?(\d+)>#{CG_BLANK}+(.*)/
# Matches a callout reference inside literal text.
#
@@ -706,8 +733,8 @@ module Asciidoctor
# 2.3+<.>m
#
# FIXME use step-wise scan (or treetop) rather than this mega-regexp
- CellSpecStartRx = /^#{CC_BLANK}*(?:(\d+(?:\.\d*)?|(?:\d*\.)?\d+)([*+]))?([<^>](?:\.[<^>]?)?|(?:[<^>]?\.)?[<^>])?([a-z])?\|/
- CellSpecEndRx = /#{CC_BLANK}+(?:(\d+(?:\.\d*)?|(?:\d*\.)?\d+)([*+]))?([<^>](?:\.[<^>]?)?|(?:[<^>]?\.)?[<^>])?([a-z])?$/
+ CellSpecStartRx = /^#{CG_BLANK}*(?:(\d+(?:\.\d*)?|(?:\d*\.)?\d+)([*+]))?([<^>](?:\.[<^>]?)?|(?:[<^>]?\.)?[<^>])?([a-z])?\|/
+ CellSpecEndRx = /#{CG_BLANK}+(?:(\d+(?:\.\d*)?|(?:\d*\.)?\d+)([*+]))?([<^>](?:\.[<^>]?)?|(?:[<^>]?\.)?[<^>])?([a-z])?$/
# Block macros
@@ -719,7 +746,7 @@ module Asciidoctor
#
#--
# NOTE we've relaxed the match for target to accomodate the short format (e.g., name::[attrlist])
- GenericBlockMacroRx = /^(\w[\w\-]*)::(\S*?)\[((?:\\\]|[^\]])*?)\]$/
+ GenericBlockMacroRx = /^(#{CG_WORD}+)::(\S*?)\[((?:\\\]|[^\]])*?)\]$/
# Matches an image, video or audio block macro.
#
@@ -750,7 +777,7 @@ module Asciidoctor
# anchor:idname[]
# anchor:idname[Reference Text]
#
- InlineAnchorRx = /\\?(?:\[\[([#{CC_ALPHA}:_][\w:.-]*)(?:,#{CC_BLANK}*(\S.*?))?\]\]|anchor:(\S+)\[(.*?[^\\])?\])/
+ InlineAnchorRx = /\\?(?:\[\[([#{CC_ALPHA}:_][#{CC_WORD}:.-]*)(?:,#{CG_BLANK}*(\S.*?))?\]\]|anchor:(\S+)\[(.*?[^\\])?\])/
# Matches a bibliography anchor anywhere inline.
#
@@ -758,13 +785,13 @@ module Asciidoctor
#
# [[[Foo]]]
#
- InlineBiblioAnchorRx = /\\?\[\[\[([\w:][\w:.-]*?)\]\]\]/
+ InlineBiblioAnchorRx = /\\?\[\[\[([#{CC_WORD}:][#{CC_WORD}:.-]*?)\]\]\]/
# Matches an inline e-mail address.
#
# doc.writer@example.com
#
- EmailInlineMacroRx = /([\\>:\/])?\w[\w.%+-]*@[#{CC_ALNUM}][#{CC_ALNUM}.-]*\.[#{CC_ALPHA}]{2,4}\b/
+ EmailInlineMacroRx = /([\\>:\/])?#{CG_WORD}[#{CC_WORD}.%+-]*@#{CG_ALNUM}[#{CC_ALNUM}.-]*\.#{CG_ALPHA}{2,4}\b/
# Matches an inline footnote macro, which is allowed to span multiple lines.
#
@@ -816,7 +843,7 @@ module Asciidoctor
# Ctrl + Alt+T
# Ctrl,T
#
- KbdDelimiterRx = /(?:\+|,)(?=#{CC_BLANK}*[^\1])/
+ KbdDelimiterRx = /(?:\+|,)(?=#{CG_BLANK}*[^\1])/
# Matches an implicit link and some of the link inline macro.
#
@@ -855,7 +882,7 @@ module Asciidoctor
# menu:View[Page Style > No Style]
# menu:View[Page Style, No Style]
#
- MenuInlineMacroRx = /\\?menu:(\w|\w.*?\S)\[#{CC_BLANK}*(.+?)?\]/
+ MenuInlineMacroRx = /\\?menu:(#{CG_WORD}|#{CG_WORD}.*?\S)\[#{CG_BLANK}*(.+?)?\]/
# Matches an implicit menu inline macro.
#
@@ -863,7 +890,7 @@ module Asciidoctor
#
# "File > New..."
#
- MenuInlineRx = /\\?"(\w[^"]*?#{CC_BLANK}*&gt;#{CC_BLANK}*[^" \t][^"]*)"/
+ MenuInlineRx = /\\?"(#{CG_WORD}[^"]*?#{CG_BLANK}*&gt;#{CG_BLANK}*[^" \t][^"]*)"/
# Matches a passthrough literal value, which may span multiple lines.
#
@@ -871,7 +898,7 @@ module Asciidoctor
#
# `text`
#
- PassInlineLiteralRx = /(^|[^`\w])(?:\[([^\]]+?)\])?(\\?`([^`\s]|[^`\s].*?\S)`)(?![`\w])/m
+ PassInlineLiteralRx = /(^|[^`#{CC_WORD}])(?:\[([^\]]+?)\])?(\\?`([^`\s]|[^`\s].*?\S)`)(?![`#{CC_WORD}])/m
# Matches several variants of the passthrough inline macro, which may span multiple lines.
#
@@ -891,7 +918,7 @@ module Asciidoctor
# xref:id[reftext]
#
# NOTE special characters have already been escaped, hence the entity references
- XrefInlineMacroRx = /\\?(?:&lt;&lt;([\w":].*?)&gt;&gt;|xref:([\w":].*?)\[(.*?)\])/m
+ XrefInlineMacroRx = /\\?(?:&lt;&lt;([#{CC_WORD}":].*?)&gt;&gt;|xref:([#{CC_WORD}":].*?)\[(.*?)\])/m
## Layout
@@ -904,8 +931,11 @@ module Asciidoctor
# Foo +
#
# NOTE: JavaScript only treats ^ and $ as line boundaries in multiline regexp
- #LineBreakRx = /^(.*)[[:blank:]]\+$/
- LineBreakRx = ::RUBY_ENGINE_OPAL ? %x(/^(.*?)[ \\t]\\+$/m) : %r{^(.*)[[:blank:]]\+$}
+ LineBreakRx = if RUBY_ENGINE == 'opal'
+ /^(.*)[ \t]\+$/m
+ else
+ /^(.*)[[:blank:]]\+$/
+ end
# Matches an AsciiDoc horizontal rule or AsciiDoc page break.
#
@@ -932,7 +962,7 @@ module Asciidoctor
# Matches a blank line.
#
# NOTE allows for empty space in line as it could be left by the template engine
- BlankLineRx = /^#{CC_BLANK}*\n/
+ BlankLineRx = /^#{CG_BLANK}*\n/
# Matches a comma or semi-colon delimiter.
#
@@ -984,7 +1014,7 @@ module Asciidoctor
#
# one\ two\ three
#
- EscapedSpaceRx = /\\(#{CC_BLANK})/
+ EscapedSpaceRx = /\\(#{CG_BLANK})/
# Matches a space delimiter that's not escaped.
#
@@ -992,13 +1022,15 @@ module Asciidoctor
#
# one two three four
#
- SpaceDelimiterRx = /([^\\])#{CC_BLANK}+/
+ SpaceDelimiterRx = /([^\\])#{CG_BLANK}+/
# Matches any character with multibyte support explicitly enabled (length of multibyte char = 1)
#
- # NOTE It's necessary to hide the use of the language modifier (u) from JavaScript
+ # NOTE If necessary to hide use of the language modifier (u) from JavaScript, use (Regexp.new '.', false, 'u')
#
- UnicodeCharScanRx = FORCE_UNICODE_LINE_LENGTH ? (Regexp.new '.', false, 'u') : nil
+ UnicodeCharScanRx = unless RUBY_ENGINE == 'opal'
+ FORCE_UNICODE_LINE_LENGTH ? /./u : nil
+ end
# Detects strings that resemble URIs.
#
@@ -1007,7 +1039,7 @@ module Asciidoctor
# https://domain
# data:info
#
- UriSniffRx = %r{^[#{CC_ALPHA}][#{CC_ALNUM}.+-]*:/{0,2}}
+ UriSniffRx = %r{^#{CG_ALPHA}[#{CC_ALNUM}.+-]*:/{0,2}}
# Detects the end of an implicit URI in the text
#
@@ -1036,12 +1068,12 @@ module Asciidoctor
#
# Here\'s Johnny!
#
- #EscapedSingleQuoteRx = /(\w)\\'(\w)/
+ #EscapedSingleQuoteRx = /(#{CG_WORD})\\'(#{CG_WORD})/
# an alternative if our backend generates single-quoted html/xml attributes
- #EscapedSingleQuoteRx = /(\w|=)\\'(\w)/
+ #EscapedSingleQuoteRx = /(#{CG_WORD}|=)\\'(#{CG_WORD})/
# Matches whitespace at the beginning of the line
- #LeadingSpacesRx = /^(#{CC_BLANK}*)/
+ #LeadingSpacesRx = /^(#{CG_BLANK}*)/
# Matches parent directory references at the beginning of a path
#LeadingParentDirsRx = /^(?:\.\.\/)*/
@@ -1091,34 +1123,34 @@ module Asciidoctor
[:strong, :unconstrained, /\\?(?:\[([^\]]+?)\])?\*\*(.+?)\*\*/m],
# *strong*
- [:strong, :constrained, /(^|[^\w;:}])(?:\[([^\]]+?)\])?\*(\S|\S.*?\S)\*(?=\W|$)/m],
+ [:strong, :constrained, /(^|[^#{CC_WORD};:}])(?:\[([^\]]+?)\])?\*(\S|\S.*?\S)\*(?!#{CG_WORD})/m],
# ``double-quoted''
- [:double, :constrained, /(^|[^\w;:}])(?:\[([^\]]+?)\])?``(\S|\S.*?\S)''(?=\W|$)/m],
+ [:double, :constrained, /(^|[^#{CC_WORD};:}])(?:\[([^\]]+?)\])?``(\S|\S.*?\S)''(?!#{CG_WORD})/m],
# 'emphasis'
- [:emphasis, :constrained, /(^|[^\w;:}])(?:\[([^\]]+?)\])?'(\S|\S.*?\S)'(?=\W|$)/m],
+ [:emphasis, :constrained, /(^|[^#{CC_WORD};:}])(?:\[([^\]]+?)\])?'(\S|\S.*?\S)'(?!#{CG_WORD})/m],
# `single-quoted'
- [:single, :constrained, /(^|[^\w;:}])(?:\[([^\]]+?)\])?`(\S|\S.*?\S)'(?=\W|$)/m],
+ [:single, :constrained, /(^|[^#{CC_WORD};:}])(?:\[([^\]]+?)\])?`(\S|\S.*?\S)'(?!#{CG_WORD})/m],
# ++monospaced++
[:monospaced, :unconstrained, /\\?(?:\[([^\]]+?)\])?\+\+(.+?)\+\+/m],
# +monospaced+
- [:monospaced, :constrained, /(^|[^\w;:}])(?:\[([^\]]+?)\])?\+(\S|\S.*?\S)\+(?=\W|$)/m],
+ [:monospaced, :constrained, /(^|[^#{CC_WORD};:}])(?:\[([^\]]+?)\])?\+(\S|\S.*?\S)\+(?!#{CG_WORD})/m],
# __emphasis__
[:emphasis, :unconstrained, /\\?(?:\[([^\]]+?)\])?__(.+?)__/m],
# _emphasis_
- [:emphasis, :constrained, /(^|[^\w;:}])(?:\[([^\]]+?)\])?_(\S|\S.*?\S)_(?=\W|$)/m],
+ [:emphasis, :constrained, /(^|[^#{CC_WORD};:}])(?:\[([^\]]+?)\])?_(\S|\S.*?\S)_(?!#{CG_WORD})/m],
# ##unquoted##
[:none, :unconstrained, /\\?(?:\[([^\]]+?)\])?##(.+?)##/m],
# #unquoted#
- [:none, :constrained, /(^|[^\w;:}])(?:\[([^\]]+?)\])?#(\S|\S.*?\S)#(?=\W|$)/m],
+ [:none, :constrained, /(^|[^#{CC_WORD};:}])(?:\[([^\]]+?)\])?#(\S|\S.*?\S)#(?!#{CG_WORD})/m],
# ^superscript^
[:superscript, :unconstrained, /\\?(?:\[([^\]]+?)\])?\^(.+?)\^/m],
@@ -1140,13 +1172,13 @@ module Asciidoctor
# foo -- bar
[/(^|\n| |\\)--( |\n|$)/, '&#8201;&#8212;&#8201;', :none],
# foo--bar
- [/(\w)\\?--(?=\w)/, '&#8212;', :leading],
+ [/(#{CG_WORD})\\?--(?=#{CG_WORD})/, '&#8212;', :leading],
# ellipsis
[/\\?\.\.\./, '&#8230;', :leading],
# apostrophe or a closing single quote (planned)
- [/([#{CC_ALPHA}])\\?'(?!')/, '&#8217;', :leading],
+ [/(#{CG_ALPHA})\\?'(?!')/, '&#8217;', :leading],
# an opening single quote (planned)
- #[/\B\\?'(?=[#{CC_ALPHA}])/, '&#8216;', :none],
+ #[/\B\\?'(?=#{CG_ALPHA})/, '&#8216;', :none],
# right arrow ->
[/\\?-&gt;/, '&#8594;', :none],
# right double arrow =>
diff --git a/lib/asciidoctor/substitutors.rb b/lib/asciidoctor/substitutors.rb
index 677506dd..1685de28 100644
--- a/lib/asciidoctor/substitutors.rb
+++ b/lib/asciidoctor/substitutors.rb
@@ -189,7 +189,7 @@ module Substitutors
text = text.gsub(PassInlineLiteralRx) {
# alias match for Ruby 1.8.7 compat
m = $~
- # fix nil results in Opal
+ # fix non-matching group results in Opal under Firefox
if ::RUBY_ENGINE_OPAL
m[2] = nil if m[2] == ''
end
@@ -197,8 +197,7 @@ module Substitutors
unescaped_attrs = nil
# honor the escape
if m[3].start_with? '\\'
- # NOTE Opal may not like %() as an enclosure around this string
- next m[2] ? "#{m[1]}[#{m[2]}]#{m[3][1..-1]}" : "#{m[1]}#{m[3][1..-1]}"
+ next m[2] ? %(#{m[1]}[#{m[2]}]#{m[3][1..-1]}) : %(#{m[1]}#{m[3][1..-1]})
elsif m[1] == '\\' && m[2]
unescaped_attrs = "[#{m[2]}]"
end
@@ -600,7 +599,7 @@ module Substitutors
next m[0][1..-1]
end
- # fix nil results in Opal
+ # fix non-matching group results in Opal under Firefox
if ::RUBY_ENGINE_OPAL
m[1] = nil if m[1] == ''
end
@@ -653,7 +652,7 @@ module Substitutors
# NOTE Opal doesn't like %() as an enclosure around this string
next "#{m[1]}#{m[2][1..-1]}#{m[3]}"
end
- # fix nil results in Opal
+ # fix non-matching group results in Opal under Firefox
if ::RUBY_ENGINE_OPAL
m[3] = nil if m[3] == ''
end
@@ -873,7 +872,7 @@ module Substitutors
if m[0].start_with? '\\'
next m[0][1..-1]
end
- # fix nil results in Opal
+ # fix non-matching group results in Opal under Firefox
if ::RUBY_ENGINE_OPAL
m[1] = nil if m[1] == ''
m[2] = nil if m[2] == ''
@@ -913,18 +912,18 @@ module Substitutors
if m[0].start_with? '\\'
next m[0][1..-1]
end
- # fix nil results in Opal
+ # fix non-matching group results in Opal under Firefox
if ::RUBY_ENGINE_OPAL
m[1] = nil if m[1] == ''
end
if m[1]
id, reftext = m[1].split(',', 2).map {|it| it.strip }
- id = id.sub(DoubleQuotedRx, ::RUBY_ENGINE_OPAL ? '$2' : '\2')
+ id = id.sub(DoubleQuotedRx, '\2')
# NOTE In Opal, reftext is set to empty string if comma is missing
reftext = if reftext.nil_or_empty?
nil
else
- reftext.sub(DoubleQuotedMultiRx, ::RUBY_ENGINE_OPAL ? '$2' : '\2')
+ reftext.sub(DoubleQuotedMultiRx, '\2')
end
else
id = m[2]
diff --git a/test/parser_test.rb b/test/parser_test.rb
index e5965d82..2045f380 100644
--- a/test/parser_test.rb
+++ b/test/parser_test.rb
@@ -286,7 +286,7 @@ context "Parser" do
end
test "parse author first" do
- metadata, = parse_header_metadata 'Stuart'
+ metadata, _ = parse_header_metadata 'Stuart'
assert_equal 5, metadata.size
assert_equal 1, metadata['authorcount']
assert_equal metadata['author'], metadata['authors']
@@ -295,7 +295,7 @@ context "Parser" do
end
test "parse author first last" do
- metadata, = parse_header_metadata 'Yukihiro Matsumoto'
+ metadata, _ = parse_header_metadata 'Yukihiro Matsumoto'
assert_equal 6, metadata.size
assert_equal 1, metadata['authorcount']
assert_equal 'Yukihiro Matsumoto', metadata['author']
@@ -306,7 +306,7 @@ context "Parser" do
end
test "parse author first middle last" do
- metadata, = parse_header_metadata 'David Heinemeier Hansson'
+ metadata, _ = parse_header_metadata 'David Heinemeier Hansson'
assert_equal 7, metadata.size
assert_equal 1, metadata['authorcount']
assert_equal 'David Heinemeier Hansson', metadata['author']
@@ -318,7 +318,7 @@ context "Parser" do
end
test "parse author first middle last email" do
- metadata, = parse_header_metadata 'David Heinemeier Hansson <rails@ruby-lang.org>'
+ metadata, _ = parse_header_metadata 'David Heinemeier Hansson <rails@ruby-lang.org>'
assert_equal 8, metadata.size
assert_equal 1, metadata['authorcount']
assert_equal 'David Heinemeier Hansson', metadata['author']
@@ -331,7 +331,7 @@ context "Parser" do
end
test "parse author first email" do
- metadata, = parse_header_metadata 'Stuart <founder@asciidoc.org>'
+ metadata, _ = parse_header_metadata 'Stuart <founder@asciidoc.org>'
assert_equal 6, metadata.size
assert_equal 1, metadata['authorcount']
assert_equal 'Stuart', metadata['author']
@@ -342,7 +342,7 @@ context "Parser" do
end
test "parse author first last email" do
- metadata, = parse_header_metadata 'Stuart Rackham <founder@asciidoc.org>'
+ metadata, _ = parse_header_metadata 'Stuart Rackham <founder@asciidoc.org>'
assert_equal 7, metadata.size
assert_equal 1, metadata['authorcount']
assert_equal 'Stuart Rackham', metadata['author']
@@ -354,7 +354,7 @@ context "Parser" do
end
test "parse author with hyphen" do
- metadata, = parse_header_metadata 'Tim Berners-Lee <founder@www.org>'
+ metadata, _ = parse_header_metadata 'Tim Berners-Lee <founder@www.org>'
assert_equal 7, metadata.size
assert_equal 1, metadata['authorcount']
assert_equal 'Tim Berners-Lee', metadata['author']
@@ -366,7 +366,7 @@ context "Parser" do
end
test "parse author with single quote" do
- metadata, = parse_header_metadata 'Stephen O\'Grady <founder@redmonk.com>'
+ metadata, _ = parse_header_metadata 'Stephen O\'Grady <founder@redmonk.com>'
assert_equal 7, metadata.size
assert_equal 1, metadata['authorcount']
assert_equal 'Stephen O\'Grady', metadata['author']
@@ -378,7 +378,7 @@ context "Parser" do
end
test "parse author with dotted initial" do
- metadata, = parse_header_metadata 'Heiko W. Rupp <hwr@example.de>'
+ metadata, _ = parse_header_metadata 'Heiko W. Rupp <hwr@example.de>'
assert_equal 8, metadata.size
assert_equal 1, metadata['authorcount']
assert_equal 'Heiko W. Rupp', metadata['author']
@@ -391,7 +391,7 @@ context "Parser" do
end
test "parse author with underscore" do
- metadata, = parse_header_metadata 'Tim_E Fella'
+ metadata, _ = parse_header_metadata 'Tim_E Fella'
assert_equal 6, metadata.size
assert_equal 1, metadata['authorcount']
assert_equal 'Tim E Fella', metadata['author']
@@ -401,8 +401,31 @@ context "Parser" do
assert_equal 'TF', metadata['authorinitials']
end
+ test 'parse author name with letters outside basic latin' do
+ metadata, _ = parse_header_metadata 'Stéphane Brontë'
+ assert_equal 6, metadata.size
+ assert_equal 1, metadata['authorcount']
+ assert_equal 'Stéphane Brontë', metadata['author']
+ assert_equal metadata['author'], metadata['authors']
+ assert_equal 'Stéphane', metadata['firstname']
+ assert_equal 'Brontë', metadata['lastname']
+ assert_equal 'SB', metadata['authorinitials']
+ end if ::RUBY_MIN_VERSION_1_9
+
+ test 'parse ideographic author names' do
+ metadata, _ = parse_header_metadata '李 四 <si.li@example.com>'
+ assert_equal 7, metadata.size
+ assert_equal 1, metadata['authorcount']
+ assert_equal '李 四', metadata['author']
+ assert_equal metadata['author'], metadata['authors']
+ assert_equal '李', metadata['firstname']
+ assert_equal '四', metadata['lastname']
+ assert_equal 'si.li@example.com', metadata['email']
+ assert_equal '李四', metadata['authorinitials']
+ end if ::RUBY_MIN_VERSION_1_9
+
test "parse author condenses whitespace" do
- metadata, = parse_header_metadata ' Stuart Rackham <founder@asciidoc.org>'
+ metadata, _ = parse_header_metadata ' Stuart Rackham <founder@asciidoc.org>'
assert_equal 7, metadata.size
assert_equal 1, metadata['authorcount']
assert_equal 'Stuart Rackham', metadata['author']
@@ -414,7 +437,7 @@ context "Parser" do
end
test "parse invalid author line becomes author" do
- metadata, = parse_header_metadata ' Stuart Rackham, founder of AsciiDoc <founder@asciidoc.org>'
+ metadata, _ = parse_header_metadata ' Stuart Rackham, founder of AsciiDoc <founder@asciidoc.org>'
assert_equal 5, metadata.size
assert_equal 1, metadata['authorcount']
assert_equal 'Stuart Rackham, founder of AsciiDoc <founder@asciidoc.org>', metadata['author']
@@ -424,7 +447,7 @@ context "Parser" do
end
test 'parse multiple authors' do
- metadata, = parse_header_metadata 'Doc Writer <doc.writer@asciidoc.org>; John Smith <john.smith@asciidoc.org>'
+ metadata, _ = parse_header_metadata 'Doc Writer <doc.writer@asciidoc.org>; John Smith <john.smith@asciidoc.org>'
assert_equal 2, metadata['authorcount']
assert_equal 'Doc Writer, John Smith', metadata['authors']
assert_equal 'Doc Writer', metadata['author']
@@ -437,7 +460,7 @@ context "Parser" do
Ryan Waldron
v0.0.7, 2013-12-18: The first release you can stand on
EOS
- metadata, = parse_header_metadata input
+ metadata, _ = parse_header_metadata input
assert_equal 9, metadata.size
assert_equal '0.0.7', metadata['revnumber']
assert_equal '2013-12-18', metadata['revdate']
@@ -449,7 +472,7 @@ v0.0.7, 2013-12-18: The first release you can stand on
Ryan Waldron
2013-12-18
EOS
- metadata, = parse_header_metadata input
+ metadata, _ = parse_header_metadata input
assert_equal 7, metadata.size
assert_equal '2013-12-18', metadata['revdate']
end
@@ -460,7 +483,7 @@ Ryan Waldron
Ryan Waldron
foobar
EOS
- metadata, = parse_header_metadata input
+ metadata, _ = parse_header_metadata input
assert_equal 7, metadata.size
assert_equal 'foobar', metadata['revdate']
end
@@ -470,7 +493,7 @@ foobar
Ryan Waldron
2013-12-18: The first release you can stand on
EOS
- metadata, = parse_header_metadata input
+ metadata, _ = parse_header_metadata input
assert_equal 8, metadata.size
assert_equal '2013-12-18', metadata['revdate']
assert_equal 'The first release you can stand on', metadata['revremark']
@@ -481,7 +504,7 @@ Ryan Waldron
Joe Cool
:page-layout: post
EOS
- metadata, = parse_header_metadata input
+ metadata, _ = parse_header_metadata input
refute_equal 'page-layout: post', metadata['revremark']
assert !metadata.has_key?('revdate')
end
@@ -491,7 +514,7 @@ Joe Cool
Joe Cool
:Must start revremark-only line with space
EOS
- metadata, = parse_header_metadata input
+ metadata, _ = parse_header_metadata input
assert_equal 'Must start revremark-only line with space', metadata['revremark']
assert_equal '', metadata['revdate']
end
@@ -502,7 +525,7 @@ Joe Cool
// release artist
Ryan Waldron
EOS
- metadata, = parse_header_metadata input
+ metadata, _ = parse_header_metadata input
assert_equal 6, metadata.size
assert_equal 1, metadata['authorcount']
assert_equal 'Ryan Waldron', metadata['author']
@@ -519,7 +542,7 @@ release artist
////
Ryan Waldron
EOS
- metadata, = parse_header_metadata input
+ metadata, _ = parse_header_metadata input
assert_equal 6, metadata.size
assert_equal 1, metadata['authorcount']
assert_equal 'Ryan Waldron', metadata['author']
@@ -537,7 +560,7 @@ release info
////
v0.0.7, 2013-12-18
EOS
- metadata, = parse_header_metadata input
+ metadata, _ = parse_header_metadata input
assert_equal 8, metadata.size
assert_equal 1, metadata['authorcount']
assert_equal 'Ryan Waldron', metadata['author']
diff --git a/test/sections_test.rb b/test/sections_test.rb
index 82e4c188..6f4adb7e 100644
--- a/test/sections_test.rb
+++ b/test/sections_test.rb
@@ -231,9 +231,13 @@ preamble
assert_xpath "//h2[@id='_my_title'][text() = 'My Title ===']", render_string("== My Title ===")
end
- test "with non-word character" do
+ test "with XML entity" do
assert_xpath "//h2[@id='_where_s_the_love'][text() = \"Where#{[8217].pack('U*')}s the love?\"]", render_string("== Where's the love?")
end
+
+ test "with non-word character" do
+ assert_xpath "//h2[@id='_where_s_the_love'][text() = \"Where’s the love?\"]", render_string("== Where’s the love?")
+ end
test "with sequential non-word characters" do
assert_xpath "//h2[@id='_what_the_is_this'][text() = 'What the \#@$ is this?']", render_string('== What the #@$ is this?')
@@ -256,8 +260,37 @@ preamble
== Asciidoctor in 中文
EOS
output = render_string input
- assert_xpath '//h2[@id="_asciidoctor_in"][text()="Asciidoctor in 中文"]', output
+ if ::RUBY_MIN_VERSION_1_9
+ assert_xpath '//h2[@id="_asciidoctor_in_中文"][text()="Asciidoctor in 中文"]', output
+ else
+ assert_xpath '//h2[@id="_asciidoctor_in"][text()="Asciidoctor in 中文"]', output
+ end
end
+
+ test 'with only multibyte characters' do
+ input = <<-EOS
+== 视图
+ EOS
+ output = render_embedded_string input
+ assert_xpath '//h2[@id="_视图"][text()="视图"]', output
+ end if ::RUBY_MIN_VERSION_1_9
+
+ test 'multiline syntax with only multibyte characters' do
+ input = <<-EOS
+视图
+--
+
+content
+
+连接器
+---
+
+content
+ EOS
+ output = render_embedded_string input
+ assert_xpath '//h2[@id="_视图"][text()="视图"]', output
+ assert_xpath '//h2[@id="_连接器"][text()="连接器"]', output
+ end if ::RUBY_MIN_VERSION_1_9
end
context "level 2" do
diff --git a/test/substitutions_test.rb b/test/substitutions_test.rb
index 8df8de5b..73907245 100644
--- a/test/substitutions_test.rb
+++ b/test/substitutions_test.rb
@@ -118,6 +118,11 @@ context 'Substitutions' do
assert_equal '<strong>bl*ck</strong>-eye', para.sub_quotes(para.source)
end
+ test 'constrained strong string containing an asterisk and multibyte word chars' do
+ para = block_from_string(%q{*黑*眼圈*})
+ assert_equal '<strong>黑*眼圈</strong>', para.sub_quotes(para.source)
+ end if ::RUBY_MIN_VERSION_1_9
+
test 'single-line constrained quote variation emphasized string' do
para = block_from_string(%q{'a few emphasized words'})
assert_equal '<em>a few emphasized words</em>', para.sub_quotes(para.source)
@@ -985,6 +990,16 @@ EOS
para = block_from_string('<span class="xmltag">&lt;node&gt;</span><span class="classname">r</span>', :attributes => {'experimental' => ''})
assert_equal %q{<span class="xmltag">&lt;node&gt;</span><span class="classname">r</span>}, para.sub_macros(para.source)
end
+
+ test 'should process menu macro with items containing multibyte characters' do
+ para = block_from_string('menu:视图[放大, 重置]', :attributes => {'experimental' => ''})
+ assert_equal %q{<span class="menuseq"><span class="menu">视图</span>&#160;&#9656; <span class="submenu">放大</span>&#160;&#9656; <span class="menuitem">重置</span></span>}, para.sub_macros(para.source)
+ end if ::RUBY_MIN_VERSION_1_9
+
+ test 'should process inline menu with items containing multibyte characters' do
+ para = block_from_string('"视图 &gt; 放大 &gt; 重置"', :attributes => {'experimental' => ''})
+ assert_equal %q{<span class="menuseq"><span class="menu">视图</span>&#160;&#9656; <span class="submenu">放大</span>&#160;&#9656; <span class="menuitem">重置</span></span>}, para.sub_macros(para.source)
+ end if ::RUBY_MIN_VERSION_1_9
end
end
@@ -1193,21 +1208,27 @@ EOS
end
test 'replaces dashes' do
- para = block_from_string %(-- foo foo--bar foo\\--bar foo -- bar foo \\-- bar
+ para = block_from_string %(-- foo foo--bar foo\\--bar foo -- bar foo \\-- bar
stuff in between
-- foo
stuff in between
foo --
stuff in between
foo --)
- expected = %(&#8201;&#8212;&#8201;foo foo&#8212;bar foo--bar foo&#8201;&#8212;&#8201;bar foo -- bar
+ expected = '&#8201;&#8212;&#8201;foo foo&#8212;bar foo--bar foo&#8201;&#8212;&#8201;bar foo -- bar
stuff in between&#8201;&#8212;&#8201;foo
stuff in between
foo&#8201;&#8212;&#8201;stuff in between
-foo&#8201;&#8212;&#8201;)
+foo&#8201;&#8212;&#8201;'
assert_equal expected, para.sub_replacements(para.source)
end
+ test 'replaces dashes between multibyte word characters' do
+ para = block_from_string %(富--巴)
+ expected = '富&#8212;巴'
+ assert_equal expected, para.sub_replacements(para.source)
+ end if ::RUBY_MIN_VERSION_1_9
+
test 'replaces marks' do
para = block_from_string '(C) (R) (TM) \(C) \(R) \(TM)'
assert_equal '&#169; &#174; &#8482; (C) (R) (TM)', para.sub_replacements(para.source)