diff options
| author | Dan Allen <dan.j.allen@gmail.com> | 2014-05-16 03:32:05 -0600 |
|---|---|---|
| committer | Dan Allen <dan.j.allen@gmail.com> | 2014-05-16 03:32:05 -0600 |
| commit | cddfc7b0ae26db4b355bae276247fa04766fdc5f (patch) | |
| tree | 395466ec8d9d4fd1d4087c60841441a6999e84d3 | |
| parent | 686e4237a14f1923818ac956a95b4396bb47156b (diff) | |
| parent | a1b17518d6f699ed6cb55f244dc7385a1a4bb845 (diff) | |
Merge pull request #951 from mojavelinux/issue-892
resolves #892 - match word characters defined by Unicode
| -rw-r--r-- | README.adoc | 8 | ||||
| -rw-r--r-- | lib/asciidoctor.rb | 160 | ||||
| -rw-r--r-- | lib/asciidoctor/substitutors.rb | 17 | ||||
| -rw-r--r-- | test/parser_test.rb | 67 | ||||
| -rw-r--r-- | test/sections_test.rb | 37 | ||||
| -rw-r--r-- | test/substitutions_test.rb | 27 |
6 files changed, 212 insertions, 104 deletions
diff --git a/README.adoc b/README.adoc index d61be259..388cf0ee 100644 --- a/README.adoc +++ b/README.adoc @@ -567,9 +567,9 @@ To run all the tests, simply execute +rake+: $ rake -If you want to run a single test file, you can use +testrb+: +If you want to run a single test file, you can use +ruby+: - $ testrb test/blocks_test.rb + $ ruby test/blocks_test.rb To test a single test case, first add the string "wip" to the beginning of the description. For example: @@ -579,9 +579,9 @@ test 'wip should render ...' do ... end -Then, run +testrb+ again, but this time pass a selector argument so it finds matching tests: +Then, run +ruby+ again, but this time pass a selector argument so it finds matching tests: - $ testrb test/blocks_test.rb -n /wip/ + $ ruby test/blocks_test.rb -n /wip/ Once you are done with your test, make sure to remove "wip" from the description and run all the tests again using +rake+. diff --git a/lib/asciidoctor.rb b/lib/asciidoctor.rb index 633c3090..1ea10e0d 100644 --- a/lib/asciidoctor.rb +++ b/lib/asciidoctor.rb @@ -2,6 +2,7 @@ RUBY_ENGINE = 'unknown' unless defined? RUBY_ENGINE RUBY_ENGINE_OPAL = (RUBY_ENGINE == 'opal') RUBY_ENGINE_JRUBY = (RUBY_ENGINE == 'jruby') RUBY_MIN_VERSION_1_9 = (RUBY_VERSION >= '1.9') +RUBY_MIN_VERSION_2 = (RUBY_VERSION >= '2') require 'set' @@ -51,6 +52,9 @@ $:.unshift File.dirname __FILE__ # module Asciidoctor + # alias the RUBY_ENGINE constant inside the Asciidoctor namespace + RUBY_ENGINE = ::RUBY_ENGINE + module SafeMode # A safe mode level that disables any of the security features enforced @@ -334,22 +338,46 @@ module Asciidoctor #(pseudo)module Rx ## Regular expression character classes (to ensure regexp compatibility between Ruby and JavaScript) + ## CC stands for "character class", CG stands for "character class group" + + # NOTE \w matches only the ASCII word characters, whereas [[:word:]] or \p{Word} matches any character in the Unicode word category. - # character classes for JavaScript Regexp engine - # NOTE use of double quotes are intentional to work around Opal issue + # character classes for the Regexp engine(s) in JavaScript if RUBY_ENGINE == 'opal' CC_ALPHA = 'a-zA-Z' + CG_ALPHA = '[a-zA-Z]' CC_ALNUM = 'a-zA-Z0-9' - CC_BLANK = "[ \\t]" - CC_GRAPH = '[\x21-\x7E]' # non-blank character; broken in Opal! - CC_EOL = "(?=\\n|$)" - # character classes for Ruby Regexp engine + CG_ALNUM = '[a-zA-Z0-9]' + CG_BLANK = '[ \\t]' + CC_EOL = '(?=\\n|$)' + CG_GRAPH = '[\\x21-\\x7E]' # non-blank character + CC_WORD = 'a-zA-Z0-9_' + CG_WORD = '[a-zA-Z0-9_]' + # character classes for the Regexp engine in Ruby >= 2 (Ruby 1.9 supports \p{} but has problems w/ encoding) + elsif ::RUBY_MIN_VERSION_2 + CC_ALPHA = CG_ALPHA = '\p{Alpha}' + CC_ALNUM = CG_ALNUM = '\p{Alnum}' + CG_BLANK = '\p{Blank}' + CC_EOL = '$' + CG_GRAPH = '\p{Graph}' + CC_WORD = CG_WORD = '\p{Word}' + # character classes for the Regexp engine in Ruby < 2 else CC_ALPHA = '[:alpha:]' + CG_ALPHA = '[[:alpha:]]' CC_ALNUM = '[:alnum:]' - CC_BLANK = '[[:blank:]]' - CC_GRAPH = '[[:graph:]]' # non-blank character + CG_ALNUM = '[[:alnum:]]' + CG_BLANK = '[[:blank:]]' CC_EOL = '$' + CG_GRAPH = '[[:graph:]]' # non-blank character + if ::RUBY_MIN_VERSION_1_9 + CC_WORD = '[:word:]' + CG_WORD = '[[:word:]]' + else + # NOTE Ruby 1.8 cannot match word characters beyond the ASCII range; if you need this feature, upgrade! + CC_WORD = '[:alnum:]_' + CG_WORD = '[[:alnum:]_]' + end end ## Document header @@ -359,8 +387,9 @@ module Asciidoctor # Examples # # Doc Writer <doc@example.com> + # Mary_Sue Brontë # - AuthorInfoLineRx = /^(\w[\w\-'.]*)(?: +(\w[\w\-'.]*))?(?: +(\w[\w\-'.]*))?(?: +<([^>]+)>)?$/ + AuthorInfoLineRx = /^(#{CG_WORD}[#{CC_WORD}\-'.]*)(?: +(#{CG_WORD}[#{CC_WORD}\-'.]*))?(?: +(#{CG_WORD}[#{CC_WORD}\-'.]*))?(?: +<([^>]+)>)?$/ # Matches the revision info line, which appears immediately following # the author info line beneath the document title. @@ -385,7 +414,7 @@ module Asciidoctor # # asciidoctor - converts AsciiDoc source files to HTML, DocBook and other formats # - ManpageNamePurposeRx = /^(.*?)#{CC_BLANK}+-#{CC_BLANK}+(.*)$/ + ManpageNamePurposeRx = /^(.*?)#{CG_BLANK}+-#{CG_BLANK}+(.*)$/ ## Preprocessor directives @@ -409,9 +438,7 @@ module Asciidoctor # # "{asciidoctor-version}" >= "0.1.0" # - EvalExpressionRx = /^(\S.*?)#{CC_BLANK}*(==|!=|<=|>=|<|>)#{CC_BLANK}*(\S.*)$/ - # ...or if we want to be more strict up front about what's on each side - # EvalExpressionRx = /^(true|false|("|'|)\{\w+(?:\-\w+)*\}\2|("|')[^\3]*\3|\-?\d+(?:\.\d+)*)#{CC_BLANK}*(==|!=|<=|>=|<|>)#{CC_BLANK}*(true|false|("|'|)\{\w+(?:\-\w+)*\}\6|("|')[^\7]*\7|\-?\d+(?:\.\d+)*)$/ + EvalExpressionRx = /^(\S.*?)#{CG_BLANK}*(==|!=|<=|>=|<|>)#{CG_BLANK}*(\S.*)$/ # Matches an include preprocessor directive. # @@ -437,7 +464,7 @@ module Asciidoctor # collapsing the line breaks and indentation to # a single space. # - AttributeEntryRx = /^:(!?\w.*?):(?:#{CC_BLANK}+(.*))?$/ + AttributeEntryRx = /^:(!?\w.*?):(?:#{CG_BLANK}+(.*))?$/ # Matches invalid characters in an attribute name. InvalidAttributeNameCharsRx = /[^\w\-]/ @@ -470,7 +497,7 @@ module Asciidoctor # [[idname]] # [[idname,Reference Text]] # - BlockAnchorRx = /^\[\[(?:|([#{CC_ALPHA}:_][\w:.-]*)(?:,#{CC_BLANK}*(\S.*))?)\]\]$/ + BlockAnchorRx = /^\[\[(?:|([#{CC_ALPHA}:_][#{CC_WORD}:.-]*)(?:,#{CG_BLANK}*(\S.*))?)\]\]$/ # Matches an attribute list above a block element. # @@ -485,12 +512,12 @@ module Asciidoctor # # as attribute reference # [{lead}] # - BlockAttributeListRx = /^\[(|#{CC_BLANK}*[\w\{,.#"'%].*)\]$/ + BlockAttributeListRx = /^\[(|#{CG_BLANK}*[#{CC_WORD}\{,.#"'%].*)\]$/ # A combined pattern that matches either a block anchor or a block attribute list. # # TODO this one gets hit a lot, should be optimized as much as possible - BlockAttributeLineRx = /^\[(|#{CC_BLANK}*[\w\{,.#"'%].*|\[(?:|[#{CC_ALPHA}:_][\w:.-]*(?:,#{CC_BLANK}*\S.*)?)\])\]$/ + BlockAttributeLineRx = /^\[(|#{CG_BLANK}*[#{CC_WORD}\{,.#"'%].*|\[(?:|[#{CC_ALPHA}:_][#{CC_WORD}:.-]*(?:,#{CG_BLANK}*\S.*)?)\])\]$/ # Matches a title above a block. # @@ -507,7 +534,7 @@ module Asciidoctor # NOTE: Just a little note. # TIP: Don't forget! # - AdmonitionParagraphRx = /^(#{ADMONITION_STYLES.to_a * '|'}):#{CC_BLANK}/ + AdmonitionParagraphRx = /^(#{ADMONITION_STYLES.to_a * '|'}):#{CG_BLANK}/ # Matches a literal paragraph, which is a line of text preceded by at least one space. # @@ -515,7 +542,7 @@ module Asciidoctor # # <SPACE>Foo # <TAB>Foo - LiteralParagraphRx = /^(#{CC_BLANK}+.*)$/ + LiteralParagraphRx = /^(#{CG_BLANK}+.*)$/ # Matches a comment block. # @@ -550,11 +577,11 @@ module Asciidoctor # match[1] is the delimiter, whose length determines the level # match[2] is the title itself # match[3] is an inline anchor, which becomes the section id - AtxSectionRx = /^((?:=|#){1,6})#{CC_BLANK}+(\S.*?)(?:#{CC_BLANK}+\1)?$/ + AtxSectionRx = /^((?:=|#){1,6})#{CG_BLANK}+(\S.*?)(?:#{CG_BLANK}+\1)?$/ # Matches the restricted section name for a two-line (Setext-style) section title. # The name cannot begin with a dot and has at least one alphanumeric character. - SetextSectionTitleRx = /^((?=.*\w+.*)[^.].*?)$/ + SetextSectionTitleRx = /^((?=.*#{CG_WORD}+.*)[^.].*?)$/ # Matches the underline in a two-line (Setext-style) section title. # @@ -571,10 +598,10 @@ module Asciidoctor # Section Title [[idname]] # Section Title [[idname,Reference Text]] # - InlineSectionAnchorRx = /^(.*?)#{CC_BLANK}+(\\)?\[\[([#{CC_ALPHA}:_][\w:.-]*)(?:,#{CC_BLANK}*(\S.*?))?\]\]$/ + InlineSectionAnchorRx = /^(.*?)#{CG_BLANK}+(\\)?\[\[([#{CC_ALPHA}:_][#{CC_WORD}:.-]*)(?:,#{CG_BLANK}*(\S.*?))?\]\]$/ # Matches invalid characters in a section id. - InvalidSectionIdCharsRx = /&(?:[a-zA-Z]{2,}|#\d{2,5}|#x[a-fA-F0-9]{2,4});|\W+?/ + InvalidSectionIdCharsRx = /&(?:[a-zA-Z]{2,}|#\d{2,5}|#x[a-fA-F0-9]{2,4});|[^#{CC_WORD}]+?/ # Matches the block style used to designate a section title as a floating title. # @@ -588,7 +615,7 @@ module Asciidoctor ## Lists # Detects the start of any list item. - AnyListRx = /^(?:<?\d+>#{CC_BLANK}+#{CC_GRAPH}|#{CC_BLANK}*(?:-|(?:\*|\.){1,5}|\d+\.|[a-zA-Z]\.|[IVXivx]+\))#{CC_BLANK}+#{CC_GRAPH}|#{CC_BLANK}*.*?(?::{2,4}|;;)(?:#{CC_BLANK}+#{CC_GRAPH}|$))/ + AnyListRx = /^(?:<?\d+>#{CG_BLANK}+#{CG_GRAPH}|#{CG_BLANK}*(?:-|(?:\*|\.){1,5}|\d+\.|[a-zA-Z]\.|[IVXivx]+\))#{CG_BLANK}+#{CG_GRAPH}|#{CG_BLANK}*.*?(?::{2,4}|;;)(?:#{CG_BLANK}+#{CG_GRAPH}|$))/ # Matches an unordered list item (one level for hyphens, up to 5 levels for asterisks). # @@ -597,7 +624,7 @@ module Asciidoctor # * Foo # - Foo # - UnorderedListRx = /^#{CC_BLANK}*(-|\*{1,5})#{CC_BLANK}+(.*)$/ + UnorderedListRx = /^#{CG_BLANK}*(-|\*{1,5})#{CG_BLANK}+(.*)$/ # Matches an ordered list item (explicit numbering or up to 5 consecutive dots). # @@ -612,7 +639,7 @@ module Asciidoctor # I. Foo (upperroman) # # NOTE leading space match is not always necessary, but is used for list reader - OrderedListRx = /^#{CC_BLANK}*(\.{1,5}|\d+\.|[a-zA-Z]\.|[IVXivx]+\))#{CC_BLANK}+(.*)$/ + OrderedListRx = /^#{CG_BLANK}*(\.{1,5}|\d+\.|[a-zA-Z]\.|[IVXivx]+\))#{CG_BLANK}+(.*)$/ # Matches the ordinals for each type of ordered list. OrderedListMarkerRxMap = { @@ -649,15 +676,15 @@ module Asciidoctor # NOTE negative match for comment line is intentional since that isn't handled when looking for next list item # QUESTION should we check for line comment in regex or when scanning the lines? # - DefinitionListRx = /^(?!\/\/)#{CC_BLANK}*(.*?)(:{2,4}|;;)(?:#{CC_BLANK}+(.*))?$/ + DefinitionListRx = /^(?!\/\/)#{CG_BLANK}*(.*?)(:{2,4}|;;)(?:#{CG_BLANK}+(.*))?$/ # Matches a sibling definition list item (which does not include the keyed type). DefinitionListSiblingRx = { # (?:.*?[^:])? - a non-capturing group which grabs longest sequence of characters that doesn't end w/ colon - '::' => /^(?!\/\/)#{CC_BLANK}*((?:.*[^:])?)(::)(?:#{CC_BLANK}+(.*))?$/, - ':::' => /^(?!\/\/)#{CC_BLANK}*((?:.*[^:])?)(:::)(?:#{CC_BLANK}+(.*))?$/, - '::::' => /^(?!\/\/)#{CC_BLANK}*((?:.*[^:])?)(::::)(?:#{CC_BLANK}+(.*))?$/, - ';;' => /^(?!\/\/)#{CC_BLANK}*(.*)(;;)(?:#{CC_BLANK}+(.*))?$/ + '::' => /^(?!\/\/)#{CG_BLANK}*((?:.*[^:])?)(::)(?:#{CG_BLANK}+(.*))?$/, + ':::' => /^(?!\/\/)#{CG_BLANK}*((?:.*[^:])?)(:::)(?:#{CG_BLANK}+(.*))?$/, + '::::' => /^(?!\/\/)#{CG_BLANK}*((?:.*[^:])?)(::::)(?:#{CG_BLANK}+(.*))?$/, + ';;' => /^(?!\/\/)#{CG_BLANK}*(.*)(;;)(?:#{CG_BLANK}+(.*))?$/ } # Matches a callout list item. @@ -666,7 +693,7 @@ module Asciidoctor # # <1> Foo # - CalloutListRx = /^<?(\d+)>#{CC_BLANK}+(.*)/ + CalloutListRx = /^<?(\d+)>#{CG_BLANK}+(.*)/ # Matches a callout reference inside literal text. # @@ -706,8 +733,8 @@ module Asciidoctor # 2.3+<.>m # # FIXME use step-wise scan (or treetop) rather than this mega-regexp - CellSpecStartRx = /^#{CC_BLANK}*(?:(\d+(?:\.\d*)?|(?:\d*\.)?\d+)([*+]))?([<^>](?:\.[<^>]?)?|(?:[<^>]?\.)?[<^>])?([a-z])?\|/ - CellSpecEndRx = /#{CC_BLANK}+(?:(\d+(?:\.\d*)?|(?:\d*\.)?\d+)([*+]))?([<^>](?:\.[<^>]?)?|(?:[<^>]?\.)?[<^>])?([a-z])?$/ + CellSpecStartRx = /^#{CG_BLANK}*(?:(\d+(?:\.\d*)?|(?:\d*\.)?\d+)([*+]))?([<^>](?:\.[<^>]?)?|(?:[<^>]?\.)?[<^>])?([a-z])?\|/ + CellSpecEndRx = /#{CG_BLANK}+(?:(\d+(?:\.\d*)?|(?:\d*\.)?\d+)([*+]))?([<^>](?:\.[<^>]?)?|(?:[<^>]?\.)?[<^>])?([a-z])?$/ # Block macros @@ -719,7 +746,7 @@ module Asciidoctor # #-- # NOTE we've relaxed the match for target to accomodate the short format (e.g., name::[attrlist]) - GenericBlockMacroRx = /^(\w[\w\-]*)::(\S*?)\[((?:\\\]|[^\]])*?)\]$/ + GenericBlockMacroRx = /^(#{CG_WORD}+)::(\S*?)\[((?:\\\]|[^\]])*?)\]$/ # Matches an image, video or audio block macro. # @@ -750,7 +777,7 @@ module Asciidoctor # anchor:idname[] # anchor:idname[Reference Text] # - InlineAnchorRx = /\\?(?:\[\[([#{CC_ALPHA}:_][\w:.-]*)(?:,#{CC_BLANK}*(\S.*?))?\]\]|anchor:(\S+)\[(.*?[^\\])?\])/ + InlineAnchorRx = /\\?(?:\[\[([#{CC_ALPHA}:_][#{CC_WORD}:.-]*)(?:,#{CG_BLANK}*(\S.*?))?\]\]|anchor:(\S+)\[(.*?[^\\])?\])/ # Matches a bibliography anchor anywhere inline. # @@ -758,13 +785,13 @@ module Asciidoctor # # [[[Foo]]] # - InlineBiblioAnchorRx = /\\?\[\[\[([\w:][\w:.-]*?)\]\]\]/ + InlineBiblioAnchorRx = /\\?\[\[\[([#{CC_WORD}:][#{CC_WORD}:.-]*?)\]\]\]/ # Matches an inline e-mail address. # # doc.writer@example.com # - EmailInlineMacroRx = /([\\>:\/])?\w[\w.%+-]*@[#{CC_ALNUM}][#{CC_ALNUM}.-]*\.[#{CC_ALPHA}]{2,4}\b/ + EmailInlineMacroRx = /([\\>:\/])?#{CG_WORD}[#{CC_WORD}.%+-]*@#{CG_ALNUM}[#{CC_ALNUM}.-]*\.#{CG_ALPHA}{2,4}\b/ # Matches an inline footnote macro, which is allowed to span multiple lines. # @@ -816,7 +843,7 @@ module Asciidoctor # Ctrl + Alt+T # Ctrl,T # - KbdDelimiterRx = /(?:\+|,)(?=#{CC_BLANK}*[^\1])/ + KbdDelimiterRx = /(?:\+|,)(?=#{CG_BLANK}*[^\1])/ # Matches an implicit link and some of the link inline macro. # @@ -855,7 +882,7 @@ module Asciidoctor # menu:View[Page Style > No Style] # menu:View[Page Style, No Style] # - MenuInlineMacroRx = /\\?menu:(\w|\w.*?\S)\[#{CC_BLANK}*(.+?)?\]/ + MenuInlineMacroRx = /\\?menu:(#{CG_WORD}|#{CG_WORD}.*?\S)\[#{CG_BLANK}*(.+?)?\]/ # Matches an implicit menu inline macro. # @@ -863,7 +890,7 @@ module Asciidoctor # # "File > New..." # - MenuInlineRx = /\\?"(\w[^"]*?#{CC_BLANK}*>#{CC_BLANK}*[^" \t][^"]*)"/ + MenuInlineRx = /\\?"(#{CG_WORD}[^"]*?#{CG_BLANK}*>#{CG_BLANK}*[^" \t][^"]*)"/ # Matches a passthrough literal value, which may span multiple lines. # @@ -871,7 +898,7 @@ module Asciidoctor # # `text` # - PassInlineLiteralRx = /(^|[^`\w])(?:\[([^\]]+?)\])?(\\?`([^`\s]|[^`\s].*?\S)`)(?![`\w])/m + PassInlineLiteralRx = /(^|[^`#{CC_WORD}])(?:\[([^\]]+?)\])?(\\?`([^`\s]|[^`\s].*?\S)`)(?![`#{CC_WORD}])/m # Matches several variants of the passthrough inline macro, which may span multiple lines. # @@ -891,7 +918,7 @@ module Asciidoctor # xref:id[reftext] # # NOTE special characters have already been escaped, hence the entity references - XrefInlineMacroRx = /\\?(?:<<([\w":].*?)>>|xref:([\w":].*?)\[(.*?)\])/m + XrefInlineMacroRx = /\\?(?:<<([#{CC_WORD}":].*?)>>|xref:([#{CC_WORD}":].*?)\[(.*?)\])/m ## Layout @@ -904,8 +931,11 @@ module Asciidoctor # Foo + # # NOTE: JavaScript only treats ^ and $ as line boundaries in multiline regexp - #LineBreakRx = /^(.*)[[:blank:]]\+$/ - LineBreakRx = ::RUBY_ENGINE_OPAL ? %x(/^(.*?)[ \\t]\\+$/m) : %r{^(.*)[[:blank:]]\+$} + LineBreakRx = if RUBY_ENGINE == 'opal' + /^(.*)[ \t]\+$/m + else + /^(.*)[[:blank:]]\+$/ + end # Matches an AsciiDoc horizontal rule or AsciiDoc page break. # @@ -932,7 +962,7 @@ module Asciidoctor # Matches a blank line. # # NOTE allows for empty space in line as it could be left by the template engine - BlankLineRx = /^#{CC_BLANK}*\n/ + BlankLineRx = /^#{CG_BLANK}*\n/ # Matches a comma or semi-colon delimiter. # @@ -984,7 +1014,7 @@ module Asciidoctor # # one\ two\ three # - EscapedSpaceRx = /\\(#{CC_BLANK})/ + EscapedSpaceRx = /\\(#{CG_BLANK})/ # Matches a space delimiter that's not escaped. # @@ -992,13 +1022,15 @@ module Asciidoctor # # one two three four # - SpaceDelimiterRx = /([^\\])#{CC_BLANK}+/ + SpaceDelimiterRx = /([^\\])#{CG_BLANK}+/ # Matches any character with multibyte support explicitly enabled (length of multibyte char = 1) # - # NOTE It's necessary to hide the use of the language modifier (u) from JavaScript + # NOTE If necessary to hide use of the language modifier (u) from JavaScript, use (Regexp.new '.', false, 'u') # - UnicodeCharScanRx = FORCE_UNICODE_LINE_LENGTH ? (Regexp.new '.', false, 'u') : nil + UnicodeCharScanRx = unless RUBY_ENGINE == 'opal' + FORCE_UNICODE_LINE_LENGTH ? /./u : nil + end # Detects strings that resemble URIs. # @@ -1007,7 +1039,7 @@ module Asciidoctor # https://domain # data:info # - UriSniffRx = %r{^[#{CC_ALPHA}][#{CC_ALNUM}.+-]*:/{0,2}} + UriSniffRx = %r{^#{CG_ALPHA}[#{CC_ALNUM}.+-]*:/{0,2}} # Detects the end of an implicit URI in the text # @@ -1036,12 +1068,12 @@ module Asciidoctor # # Here\'s Johnny! # - #EscapedSingleQuoteRx = /(\w)\\'(\w)/ + #EscapedSingleQuoteRx = /(#{CG_WORD})\\'(#{CG_WORD})/ # an alternative if our backend generates single-quoted html/xml attributes - #EscapedSingleQuoteRx = /(\w|=)\\'(\w)/ + #EscapedSingleQuoteRx = /(#{CG_WORD}|=)\\'(#{CG_WORD})/ # Matches whitespace at the beginning of the line - #LeadingSpacesRx = /^(#{CC_BLANK}*)/ + #LeadingSpacesRx = /^(#{CG_BLANK}*)/ # Matches parent directory references at the beginning of a path #LeadingParentDirsRx = /^(?:\.\.\/)*/ @@ -1091,34 +1123,34 @@ module Asciidoctor [:strong, :unconstrained, /\\?(?:\[([^\]]+?)\])?\*\*(.+?)\*\*/m], # *strong* - [:strong, :constrained, /(^|[^\w;:}])(?:\[([^\]]+?)\])?\*(\S|\S.*?\S)\*(?=\W|$)/m], + [:strong, :constrained, /(^|[^#{CC_WORD};:}])(?:\[([^\]]+?)\])?\*(\S|\S.*?\S)\*(?!#{CG_WORD})/m], # ``double-quoted'' - [:double, :constrained, /(^|[^\w;:}])(?:\[([^\]]+?)\])?``(\S|\S.*?\S)''(?=\W|$)/m], + [:double, :constrained, /(^|[^#{CC_WORD};:}])(?:\[([^\]]+?)\])?``(\S|\S.*?\S)''(?!#{CG_WORD})/m], # 'emphasis' - [:emphasis, :constrained, /(^|[^\w;:}])(?:\[([^\]]+?)\])?'(\S|\S.*?\S)'(?=\W|$)/m], + [:emphasis, :constrained, /(^|[^#{CC_WORD};:}])(?:\[([^\]]+?)\])?'(\S|\S.*?\S)'(?!#{CG_WORD})/m], # `single-quoted' - [:single, :constrained, /(^|[^\w;:}])(?:\[([^\]]+?)\])?`(\S|\S.*?\S)'(?=\W|$)/m], + [:single, :constrained, /(^|[^#{CC_WORD};:}])(?:\[([^\]]+?)\])?`(\S|\S.*?\S)'(?!#{CG_WORD})/m], # ++monospaced++ [:monospaced, :unconstrained, /\\?(?:\[([^\]]+?)\])?\+\+(.+?)\+\+/m], # +monospaced+ - [:monospaced, :constrained, /(^|[^\w;:}])(?:\[([^\]]+?)\])?\+(\S|\S.*?\S)\+(?=\W|$)/m], + [:monospaced, :constrained, /(^|[^#{CC_WORD};:}])(?:\[([^\]]+?)\])?\+(\S|\S.*?\S)\+(?!#{CG_WORD})/m], # __emphasis__ [:emphasis, :unconstrained, /\\?(?:\[([^\]]+?)\])?__(.+?)__/m], # _emphasis_ - [:emphasis, :constrained, /(^|[^\w;:}])(?:\[([^\]]+?)\])?_(\S|\S.*?\S)_(?=\W|$)/m], + [:emphasis, :constrained, /(^|[^#{CC_WORD};:}])(?:\[([^\]]+?)\])?_(\S|\S.*?\S)_(?!#{CG_WORD})/m], # ##unquoted## [:none, :unconstrained, /\\?(?:\[([^\]]+?)\])?##(.+?)##/m], # #unquoted# - [:none, :constrained, /(^|[^\w;:}])(?:\[([^\]]+?)\])?#(\S|\S.*?\S)#(?=\W|$)/m], + [:none, :constrained, /(^|[^#{CC_WORD};:}])(?:\[([^\]]+?)\])?#(\S|\S.*?\S)#(?!#{CG_WORD})/m], # ^superscript^ [:superscript, :unconstrained, /\\?(?:\[([^\]]+?)\])?\^(.+?)\^/m], @@ -1140,13 +1172,13 @@ module Asciidoctor # foo -- bar [/(^|\n| |\\)--( |\n|$)/, ' — ', :none], # foo--bar - [/(\w)\\?--(?=\w)/, '—', :leading], + [/(#{CG_WORD})\\?--(?=#{CG_WORD})/, '—', :leading], # ellipsis [/\\?\.\.\./, '…', :leading], # apostrophe or a closing single quote (planned) - [/([#{CC_ALPHA}])\\?'(?!')/, '’', :leading], + [/(#{CG_ALPHA})\\?'(?!')/, '’', :leading], # an opening single quote (planned) - #[/\B\\?'(?=[#{CC_ALPHA}])/, '‘', :none], + #[/\B\\?'(?=#{CG_ALPHA})/, '‘', :none], # right arrow -> [/\\?->/, '→', :none], # right double arrow => diff --git a/lib/asciidoctor/substitutors.rb b/lib/asciidoctor/substitutors.rb index 677506dd..1685de28 100644 --- a/lib/asciidoctor/substitutors.rb +++ b/lib/asciidoctor/substitutors.rb @@ -189,7 +189,7 @@ module Substitutors text = text.gsub(PassInlineLiteralRx) { # alias match for Ruby 1.8.7 compat m = $~ - # fix nil results in Opal + # fix non-matching group results in Opal under Firefox if ::RUBY_ENGINE_OPAL m[2] = nil if m[2] == '' end @@ -197,8 +197,7 @@ module Substitutors unescaped_attrs = nil # honor the escape if m[3].start_with? '\\' - # NOTE Opal may not like %() as an enclosure around this string - next m[2] ? "#{m[1]}[#{m[2]}]#{m[3][1..-1]}" : "#{m[1]}#{m[3][1..-1]}" + next m[2] ? %(#{m[1]}[#{m[2]}]#{m[3][1..-1]}) : %(#{m[1]}#{m[3][1..-1]}) elsif m[1] == '\\' && m[2] unescaped_attrs = "[#{m[2]}]" end @@ -600,7 +599,7 @@ module Substitutors next m[0][1..-1] end - # fix nil results in Opal + # fix non-matching group results in Opal under Firefox if ::RUBY_ENGINE_OPAL m[1] = nil if m[1] == '' end @@ -653,7 +652,7 @@ module Substitutors # NOTE Opal doesn't like %() as an enclosure around this string next "#{m[1]}#{m[2][1..-1]}#{m[3]}" end - # fix nil results in Opal + # fix non-matching group results in Opal under Firefox if ::RUBY_ENGINE_OPAL m[3] = nil if m[3] == '' end @@ -873,7 +872,7 @@ module Substitutors if m[0].start_with? '\\' next m[0][1..-1] end - # fix nil results in Opal + # fix non-matching group results in Opal under Firefox if ::RUBY_ENGINE_OPAL m[1] = nil if m[1] == '' m[2] = nil if m[2] == '' @@ -913,18 +912,18 @@ module Substitutors if m[0].start_with? '\\' next m[0][1..-1] end - # fix nil results in Opal + # fix non-matching group results in Opal under Firefox if ::RUBY_ENGINE_OPAL m[1] = nil if m[1] == '' end if m[1] id, reftext = m[1].split(',', 2).map {|it| it.strip } - id = id.sub(DoubleQuotedRx, ::RUBY_ENGINE_OPAL ? '$2' : '\2') + id = id.sub(DoubleQuotedRx, '\2') # NOTE In Opal, reftext is set to empty string if comma is missing reftext = if reftext.nil_or_empty? nil else - reftext.sub(DoubleQuotedMultiRx, ::RUBY_ENGINE_OPAL ? '$2' : '\2') + reftext.sub(DoubleQuotedMultiRx, '\2') end else id = m[2] diff --git a/test/parser_test.rb b/test/parser_test.rb index e5965d82..2045f380 100644 --- a/test/parser_test.rb +++ b/test/parser_test.rb @@ -286,7 +286,7 @@ context "Parser" do end test "parse author first" do - metadata, = parse_header_metadata 'Stuart' + metadata, _ = parse_header_metadata 'Stuart' assert_equal 5, metadata.size assert_equal 1, metadata['authorcount'] assert_equal metadata['author'], metadata['authors'] @@ -295,7 +295,7 @@ context "Parser" do end test "parse author first last" do - metadata, = parse_header_metadata 'Yukihiro Matsumoto' + metadata, _ = parse_header_metadata 'Yukihiro Matsumoto' assert_equal 6, metadata.size assert_equal 1, metadata['authorcount'] assert_equal 'Yukihiro Matsumoto', metadata['author'] @@ -306,7 +306,7 @@ context "Parser" do end test "parse author first middle last" do - metadata, = parse_header_metadata 'David Heinemeier Hansson' + metadata, _ = parse_header_metadata 'David Heinemeier Hansson' assert_equal 7, metadata.size assert_equal 1, metadata['authorcount'] assert_equal 'David Heinemeier Hansson', metadata['author'] @@ -318,7 +318,7 @@ context "Parser" do end test "parse author first middle last email" do - metadata, = parse_header_metadata 'David Heinemeier Hansson <rails@ruby-lang.org>' + metadata, _ = parse_header_metadata 'David Heinemeier Hansson <rails@ruby-lang.org>' assert_equal 8, metadata.size assert_equal 1, metadata['authorcount'] assert_equal 'David Heinemeier Hansson', metadata['author'] @@ -331,7 +331,7 @@ context "Parser" do end test "parse author first email" do - metadata, = parse_header_metadata 'Stuart <founder@asciidoc.org>' + metadata, _ = parse_header_metadata 'Stuart <founder@asciidoc.org>' assert_equal 6, metadata.size assert_equal 1, metadata['authorcount'] assert_equal 'Stuart', metadata['author'] @@ -342,7 +342,7 @@ context "Parser" do end test "parse author first last email" do - metadata, = parse_header_metadata 'Stuart Rackham <founder@asciidoc.org>' + metadata, _ = parse_header_metadata 'Stuart Rackham <founder@asciidoc.org>' assert_equal 7, metadata.size assert_equal 1, metadata['authorcount'] assert_equal 'Stuart Rackham', metadata['author'] @@ -354,7 +354,7 @@ context "Parser" do end test "parse author with hyphen" do - metadata, = parse_header_metadata 'Tim Berners-Lee <founder@www.org>' + metadata, _ = parse_header_metadata 'Tim Berners-Lee <founder@www.org>' assert_equal 7, metadata.size assert_equal 1, metadata['authorcount'] assert_equal 'Tim Berners-Lee', metadata['author'] @@ -366,7 +366,7 @@ context "Parser" do end test "parse author with single quote" do - metadata, = parse_header_metadata 'Stephen O\'Grady <founder@redmonk.com>' + metadata, _ = parse_header_metadata 'Stephen O\'Grady <founder@redmonk.com>' assert_equal 7, metadata.size assert_equal 1, metadata['authorcount'] assert_equal 'Stephen O\'Grady', metadata['author'] @@ -378,7 +378,7 @@ context "Parser" do end test "parse author with dotted initial" do - metadata, = parse_header_metadata 'Heiko W. Rupp <hwr@example.de>' + metadata, _ = parse_header_metadata 'Heiko W. Rupp <hwr@example.de>' assert_equal 8, metadata.size assert_equal 1, metadata['authorcount'] assert_equal 'Heiko W. Rupp', metadata['author'] @@ -391,7 +391,7 @@ context "Parser" do end test "parse author with underscore" do - metadata, = parse_header_metadata 'Tim_E Fella' + metadata, _ = parse_header_metadata 'Tim_E Fella' assert_equal 6, metadata.size assert_equal 1, metadata['authorcount'] assert_equal 'Tim E Fella', metadata['author'] @@ -401,8 +401,31 @@ context "Parser" do assert_equal 'TF', metadata['authorinitials'] end + test 'parse author name with letters outside basic latin' do + metadata, _ = parse_header_metadata 'Stéphane Brontë' + assert_equal 6, metadata.size + assert_equal 1, metadata['authorcount'] + assert_equal 'Stéphane Brontë', metadata['author'] + assert_equal metadata['author'], metadata['authors'] + assert_equal 'Stéphane', metadata['firstname'] + assert_equal 'Brontë', metadata['lastname'] + assert_equal 'SB', metadata['authorinitials'] + end if ::RUBY_MIN_VERSION_1_9 + + test 'parse ideographic author names' do + metadata, _ = parse_header_metadata '李 四 <si.li@example.com>' + assert_equal 7, metadata.size + assert_equal 1, metadata['authorcount'] + assert_equal '李 四', metadata['author'] + assert_equal metadata['author'], metadata['authors'] + assert_equal '李', metadata['firstname'] + assert_equal '四', metadata['lastname'] + assert_equal 'si.li@example.com', metadata['email'] + assert_equal '李四', metadata['authorinitials'] + end if ::RUBY_MIN_VERSION_1_9 + test "parse author condenses whitespace" do - metadata, = parse_header_metadata ' Stuart Rackham <founder@asciidoc.org>' + metadata, _ = parse_header_metadata ' Stuart Rackham <founder@asciidoc.org>' assert_equal 7, metadata.size assert_equal 1, metadata['authorcount'] assert_equal 'Stuart Rackham', metadata['author'] @@ -414,7 +437,7 @@ context "Parser" do end test "parse invalid author line becomes author" do - metadata, = parse_header_metadata ' Stuart Rackham, founder of AsciiDoc <founder@asciidoc.org>' + metadata, _ = parse_header_metadata ' Stuart Rackham, founder of AsciiDoc <founder@asciidoc.org>' assert_equal 5, metadata.size assert_equal 1, metadata['authorcount'] assert_equal 'Stuart Rackham, founder of AsciiDoc <founder@asciidoc.org>', metadata['author'] @@ -424,7 +447,7 @@ context "Parser" do end test 'parse multiple authors' do - metadata, = parse_header_metadata 'Doc Writer <doc.writer@asciidoc.org>; John Smith <john.smith@asciidoc.org>' + metadata, _ = parse_header_metadata 'Doc Writer <doc.writer@asciidoc.org>; John Smith <john.smith@asciidoc.org>' assert_equal 2, metadata['authorcount'] assert_equal 'Doc Writer, John Smith', metadata['authors'] assert_equal 'Doc Writer', metadata['author'] @@ -437,7 +460,7 @@ context "Parser" do Ryan Waldron v0.0.7, 2013-12-18: The first release you can stand on EOS - metadata, = parse_header_metadata input + metadata, _ = parse_header_metadata input assert_equal 9, metadata.size assert_equal '0.0.7', metadata['revnumber'] assert_equal '2013-12-18', metadata['revdate'] @@ -449,7 +472,7 @@ v0.0.7, 2013-12-18: The first release you can stand on Ryan Waldron 2013-12-18 EOS - metadata, = parse_header_metadata input + metadata, _ = parse_header_metadata input assert_equal 7, metadata.size assert_equal '2013-12-18', metadata['revdate'] end @@ -460,7 +483,7 @@ Ryan Waldron Ryan Waldron foobar EOS - metadata, = parse_header_metadata input + metadata, _ = parse_header_metadata input assert_equal 7, metadata.size assert_equal 'foobar', metadata['revdate'] end @@ -470,7 +493,7 @@ foobar Ryan Waldron 2013-12-18: The first release you can stand on EOS - metadata, = parse_header_metadata input + metadata, _ = parse_header_metadata input assert_equal 8, metadata.size assert_equal '2013-12-18', metadata['revdate'] assert_equal 'The first release you can stand on', metadata['revremark'] @@ -481,7 +504,7 @@ Ryan Waldron Joe Cool :page-layout: post EOS - metadata, = parse_header_metadata input + metadata, _ = parse_header_metadata input refute_equal 'page-layout: post', metadata['revremark'] assert !metadata.has_key?('revdate') end @@ -491,7 +514,7 @@ Joe Cool Joe Cool :Must start revremark-only line with space EOS - metadata, = parse_header_metadata input + metadata, _ = parse_header_metadata input assert_equal 'Must start revremark-only line with space', metadata['revremark'] assert_equal '', metadata['revdate'] end @@ -502,7 +525,7 @@ Joe Cool // release artist Ryan Waldron EOS - metadata, = parse_header_metadata input + metadata, _ = parse_header_metadata input assert_equal 6, metadata.size assert_equal 1, metadata['authorcount'] assert_equal 'Ryan Waldron', metadata['author'] @@ -519,7 +542,7 @@ release artist //// Ryan Waldron EOS - metadata, = parse_header_metadata input + metadata, _ = parse_header_metadata input assert_equal 6, metadata.size assert_equal 1, metadata['authorcount'] assert_equal 'Ryan Waldron', metadata['author'] @@ -537,7 +560,7 @@ release info //// v0.0.7, 2013-12-18 EOS - metadata, = parse_header_metadata input + metadata, _ = parse_header_metadata input assert_equal 8, metadata.size assert_equal 1, metadata['authorcount'] assert_equal 'Ryan Waldron', metadata['author'] diff --git a/test/sections_test.rb b/test/sections_test.rb index 82e4c188..6f4adb7e 100644 --- a/test/sections_test.rb +++ b/test/sections_test.rb @@ -231,9 +231,13 @@ preamble assert_xpath "//h2[@id='_my_title'][text() = 'My Title ===']", render_string("== My Title ===") end - test "with non-word character" do + test "with XML entity" do assert_xpath "//h2[@id='_where_s_the_love'][text() = \"Where#{[8217].pack('U*')}s the love?\"]", render_string("== Where's the love?") end + + test "with non-word character" do + assert_xpath "//h2[@id='_where_s_the_love'][text() = \"Where’s the love?\"]", render_string("== Where’s the love?") + end test "with sequential non-word characters" do assert_xpath "//h2[@id='_what_the_is_this'][text() = 'What the \#@$ is this?']", render_string('== What the #@$ is this?') @@ -256,8 +260,37 @@ preamble == Asciidoctor in 中文 EOS output = render_string input - assert_xpath '//h2[@id="_asciidoctor_in"][text()="Asciidoctor in 中文"]', output + if ::RUBY_MIN_VERSION_1_9 + assert_xpath '//h2[@id="_asciidoctor_in_中文"][text()="Asciidoctor in 中文"]', output + else + assert_xpath '//h2[@id="_asciidoctor_in"][text()="Asciidoctor in 中文"]', output + end end + + test 'with only multibyte characters' do + input = <<-EOS +== 视图 + EOS + output = render_embedded_string input + assert_xpath '//h2[@id="_视图"][text()="视图"]', output + end if ::RUBY_MIN_VERSION_1_9 + + test 'multiline syntax with only multibyte characters' do + input = <<-EOS +视图 +-- + +content + +连接器 +--- + +content + EOS + output = render_embedded_string input + assert_xpath '//h2[@id="_视图"][text()="视图"]', output + assert_xpath '//h2[@id="_连接器"][text()="连接器"]', output + end if ::RUBY_MIN_VERSION_1_9 end context "level 2" do diff --git a/test/substitutions_test.rb b/test/substitutions_test.rb index 8df8de5b..73907245 100644 --- a/test/substitutions_test.rb +++ b/test/substitutions_test.rb @@ -118,6 +118,11 @@ context 'Substitutions' do assert_equal '<strong>bl*ck</strong>-eye', para.sub_quotes(para.source) end + test 'constrained strong string containing an asterisk and multibyte word chars' do + para = block_from_string(%q{*黑*眼圈*}) + assert_equal '<strong>黑*眼圈</strong>', para.sub_quotes(para.source) + end if ::RUBY_MIN_VERSION_1_9 + test 'single-line constrained quote variation emphasized string' do para = block_from_string(%q{'a few emphasized words'}) assert_equal '<em>a few emphasized words</em>', para.sub_quotes(para.source) @@ -985,6 +990,16 @@ EOS para = block_from_string('<span class="xmltag"><node></span><span class="classname">r</span>', :attributes => {'experimental' => ''}) assert_equal %q{<span class="xmltag"><node></span><span class="classname">r</span>}, para.sub_macros(para.source) end + + test 'should process menu macro with items containing multibyte characters' do + para = block_from_string('menu:视图[放大, 重置]', :attributes => {'experimental' => ''}) + assert_equal %q{<span class="menuseq"><span class="menu">视图</span> ▸ <span class="submenu">放大</span> ▸ <span class="menuitem">重置</span></span>}, para.sub_macros(para.source) + end if ::RUBY_MIN_VERSION_1_9 + + test 'should process inline menu with items containing multibyte characters' do + para = block_from_string('"视图 > 放大 > 重置"', :attributes => {'experimental' => ''}) + assert_equal %q{<span class="menuseq"><span class="menu">视图</span> ▸ <span class="submenu">放大</span> ▸ <span class="menuitem">重置</span></span>}, para.sub_macros(para.source) + end if ::RUBY_MIN_VERSION_1_9 end end @@ -1193,21 +1208,27 @@ EOS end test 'replaces dashes' do - para = block_from_string %(-- foo foo--bar foo\\--bar foo -- bar foo \\-- bar + para = block_from_string %(-- foo foo--bar foo\\--bar foo -- bar foo \\-- bar stuff in between -- foo stuff in between foo -- stuff in between foo --) - expected = %( — foo foo—bar foo--bar foo — bar foo -- bar + expected = ' — foo foo—bar foo--bar foo — bar foo -- bar stuff in between — foo stuff in between foo — stuff in between -foo — ) +foo — ' assert_equal expected, para.sub_replacements(para.source) end + test 'replaces dashes between multibyte word characters' do + para = block_from_string %(富--巴) + expected = '富—巴' + assert_equal expected, para.sub_replacements(para.source) + end if ::RUBY_MIN_VERSION_1_9 + test 'replaces marks' do para = block_from_string '(C) (R) (TM) \(C) \(R) \(TM)' assert_equal '© ® ™ (C) (R) (TM)', para.sub_replacements(para.source) |
